diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,140803 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998401789995205, + "eval_steps": 500, + "global_step": 4692, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1327.1429443359375, + "completions/mean_terminated_length": 815.3893432617188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.00021309466730595067, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11880554905295186, + "kl": 0.0005807876586914062, + "learning_rate": 0.0, + "loss": 0.0525, + "num_tokens": 659824.0, + "reward": 0.590401828289032, + "reward_std": 0.417092889547348, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1439732164144516, + "rewards/tag_count_reward/std": 0.18524260818958282, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1581.8817138671875, + "completions/mean_terminated_length": 1039.202880859375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.00042618933461190133, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.9835778773808289, + "kl": 0.0007829666137695312, + "learning_rate": 2.127659574468085e-09, + "loss": 0.0479, + "num_tokens": 1443227.0, + "reward": 0.51953125, + "reward_std": 0.3486446738243103, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1244419664144516, + "rewards/tag_count_reward/std": 0.15507708489894867, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1447.96435546875, + "completions/mean_terminated_length": 932.5809326171875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.000639284001917852, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10499796417370877, + "kl": 0.0006656646728515625, + "learning_rate": 4.25531914893617e-09, + "loss": 0.0887, + "num_tokens": 2169131.0, + "reward": 0.5764509439468384, + "reward_std": 0.427979052066803, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1434151828289032, + "rewards/tag_count_reward/std": 0.17117878794670105, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1508.450927734375, + "completions/mean_terminated_length": 913.1737060546875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0008523786692238027, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1018359279083423, + "kl": 0.0005950927734375, + "learning_rate": 6.382978723404255e-09, + "loss": 0.0928, + "num_tokens": 2914437.0, + "reward": 0.431919664144516, + "reward_std": 0.36637213826179504, + "rewards/accuracy_reward/mean": 0.2946428656578064, + "rewards/accuracy_reward/std": 0.45639166235923767, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1372767835855484, + "rewards/tag_count_reward/std": 0.18277305364608765, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1506.0626220703125, + "completions/mean_terminated_length": 863.6682739257812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.0010654733365297534, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.21057650648045662, + "kl": 0.0006418228149414062, + "learning_rate": 8.51063829787234e-09, + "loss": 0.0709, + "num_tokens": 3657233.0, + "reward": 0.4151785969734192, + "reward_std": 0.30718994140625, + "rewards/accuracy_reward/mean": 0.2857142984867096, + "rewards/accuracy_reward/std": 0.45225897431373596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1294642835855484, + "rewards/tag_count_reward/std": 0.18765640258789062, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1621.5157470703125, + "completions/mean_terminated_length": 846.3333129882812, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.001278568003835704, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09960264735253337, + "kl": 0.0006780624389648438, + "learning_rate": 1.0638297872340425e-08, + "loss": 0.0792, + "num_tokens": 4455016.0, + "reward": 0.3738839328289032, + "reward_std": 0.3686589002609253, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44215917587280273, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1082589253783226, + "rewards/tag_count_reward/std": 0.16469375789165497, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1571.7098388671875, + "completions/mean_terminated_length": 835.625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0014916626711416548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10830495262519961, + "kl": 0.0005960464477539062, + "learning_rate": 1.276595744680851e-08, + "loss": 0.06, + "num_tokens": 5233366.0, + "reward": 0.291294664144516, + "reward_std": 0.35457053780555725, + "rewards/accuracy_reward/mean": 0.1808035671710968, + "rewards/accuracy_reward/std": 0.3852855861186981, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1104910746216774, + "rewards/tag_count_reward/std": 0.1802973598241806, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1478.154052734375, + "completions/mean_terminated_length": 997.4197387695312, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.0017047573384476053, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1165773576803505, + "kl": 0.0007266998291015625, + "learning_rate": 1.4893617021276595e-08, + "loss": 0.0932, + "num_tokens": 5962123.0, + "reward": 0.5491071939468384, + "reward_std": 0.38611385226249695, + "rewards/accuracy_reward/mean": 0.4236111044883728, + "rewards/accuracy_reward/std": 0.4947032034397125, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.140625, + "rewards/tag_count_reward/std": 0.16899244487285614, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1435.9063720703125, + "completions/mean_terminated_length": 959.8333740234375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.001917852005753556, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10603288197538065, + "kl": 0.0006589889526367188, + "learning_rate": 1.702127659574468e-08, + "loss": 0.0621, + "num_tokens": 6686481.0, + "reward": 0.590401828289032, + "reward_std": 0.3635442852973938, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1506696492433548, + "rewards/tag_count_reward/std": 0.1734941452741623, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1382.2098388671875, + "completions/mean_terminated_length": 854.904052734375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.002130946673059507, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11471021646683922, + "kl": 0.000652313232421875, + "learning_rate": 1.9148936170212764e-08, + "loss": 0.0799, + "num_tokens": 7371935.0, + "reward": 0.582589328289032, + "reward_std": 0.40267178416252136, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1584821492433548, + "rewards/tag_count_reward/std": 0.19785068929195404, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1536.4888916015625, + "completions/mean_terminated_length": 940.9613647460938, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.0023440413403654574, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09045626739258034, + "kl": 0.0005979537963867188, + "learning_rate": 2.127659574468085e-08, + "loss": 0.11, + "num_tokens": 8126682.0, + "reward": 0.4575892984867096, + "reward_std": 0.35018956661224365, + "rewards/accuracy_reward/mean": 0.3303571343421936, + "rewards/accuracy_reward/std": 0.4708675146102905, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1272321492433548, + "rewards/tag_count_reward/std": 0.17537283897399902, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1438.700927734375, + "completions/mean_terminated_length": 901.0841064453125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.002557136007671408, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1092851498354237, + "kl": 0.0005674362182617188, + "learning_rate": 2.3404255319148933e-08, + "loss": 0.0486, + "num_tokens": 8838468.0, + "reward": 0.6579241156578064, + "reward_std": 0.3641918897628784, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1623883992433548, + "rewards/tag_count_reward/std": 0.18918608129024506, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1497.3460693359375, + "completions/mean_terminated_length": 895.2289428710938, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.0027702306749773586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11315634547553356, + "kl": 0.0006742477416992188, + "learning_rate": 2.553191489361702e-08, + "loss": 0.0854, + "num_tokens": 9586271.0, + "reward": 0.490513414144516, + "reward_std": 0.3397141993045807, + "rewards/accuracy_reward/mean": 0.3638392984867096, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1266741007566452, + "rewards/tag_count_reward/std": 0.16298216581344604, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1476.227783203125, + "completions/mean_terminated_length": 924.517578125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.0029833253422833095, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09741794238486032, + "kl": 0.00063323974609375, + "learning_rate": 2.7659574468085105e-08, + "loss": 0.0669, + "num_tokens": 10313509.0, + "reward": 0.5172991156578064, + "reward_std": 0.4070625603199005, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1356026828289032, + "rewards/tag_count_reward/std": 0.1774454265832901, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1312.8929443359375, + "completions/mean_terminated_length": 859.0902709960938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.00319642000958926, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11056495464072837, + "kl": 0.0006198883056640625, + "learning_rate": 2.978723404255319e-08, + "loss": 0.0777, + "num_tokens": 10973525.0, + "reward": 0.5881696939468384, + "reward_std": 0.41655388474464417, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1551339328289032, + "rewards/tag_count_reward/std": 0.19554999470710754, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1393.80810546875, + "completions/mean_terminated_length": 841.9176635742188, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.0034095146768952107, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10355788177332209, + "kl": 0.0005626678466796875, + "learning_rate": 3.191489361702127e-08, + "loss": 0.056, + "num_tokens": 11668783.0, + "reward": 0.5334821939468384, + "reward_std": 0.3836354911327362, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1540178507566452, + "rewards/tag_count_reward/std": 0.1964321881532669, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1613.8504638671875, + "completions/mean_terminated_length": 776.7647094726562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0036226093442011612, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.09515067486346335, + "kl": 0.0006103515625, + "learning_rate": 3.404255319148936e-08, + "loss": 0.0763, + "num_tokens": 12459692.0, + "reward": 0.3895089328289032, + "reward_std": 0.32996246218681335, + "rewards/accuracy_reward/mean": 0.2901785671710968, + "rewards/accuracy_reward/std": 0.4543520212173462, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0993303582072258, + "rewards/tag_count_reward/std": 0.17105932533740997, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1384.009033203125, + "completions/mean_terminated_length": 823.851806640625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.003835704011507112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10973826470058803, + "kl": 0.00054168701171875, + "learning_rate": 3.617021276595744e-08, + "loss": 0.0804, + "num_tokens": 13153088.0, + "reward": 0.4603794813156128, + "reward_std": 0.343288779258728, + "rewards/accuracy_reward/mean": 0.3258928656578064, + "rewards/accuracy_reward/std": 0.4692314565181732, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1344866007566452, + "rewards/tag_count_reward/std": 0.1686217337846756, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1576.8013916015625, + "completions/mean_terminated_length": 1002.96533203125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.004048798678813062, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09850193451964769, + "kl": 0.0006160736083984375, + "learning_rate": 3.829787234042553e-08, + "loss": 0.0535, + "num_tokens": 13929223.0, + "reward": 0.4715402126312256, + "reward_std": 0.3523045480251312, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1277901828289032, + "rewards/tag_count_reward/std": 0.1800851821899414, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1434.8929443359375, + "completions/mean_terminated_length": 793.7899169921875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.004261893346119014, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10200327428815918, + "kl": 0.0005636215209960938, + "learning_rate": 4.042553191489362e-08, + "loss": 0.0899, + "num_tokens": 14655543.0, + "reward": 0.4693080484867096, + "reward_std": 0.36909398436546326, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399619221687317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1300223171710968, + "rewards/tag_count_reward/std": 0.19278773665428162, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1556.779052734375, + "completions/mean_terminated_length": 871.171142578125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.004474988013424964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09078999703545892, + "kl": 0.0005154609680175781, + "learning_rate": 4.25531914893617e-08, + "loss": 0.0339, + "num_tokens": 15421684.0, + "reward": 0.3738839328289032, + "reward_std": 0.3145078122615814, + "rewards/accuracy_reward/mean": 0.26157405972480774, + "rewards/accuracy_reward/std": 0.4400014281272888, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1216517835855484, + "rewards/tag_count_reward/std": 0.17773102223873138, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1498.372802734375, + "completions/mean_terminated_length": 869.8516235351562, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.004688082680730915, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10555568921525868, + "kl": 0.0006399154663085938, + "learning_rate": 4.4680851063829786e-08, + "loss": 0.108, + "num_tokens": 16160331.0, + "reward": 0.490513414144516, + "reward_std": 0.36637213826179504, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1311383992433548, + "rewards/tag_count_reward/std": 0.17686758935451508, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1437.4598388671875, + "completions/mean_terminated_length": 940.62353515625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.0049011773480368654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11060612805320082, + "kl": 0.000606536865234375, + "learning_rate": 4.6808510638297865e-08, + "loss": 0.0842, + "num_tokens": 16877817.0, + "reward": 0.535714328289032, + "reward_std": 0.39839938282966614, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1473214328289032, + "rewards/tag_count_reward/std": 0.1922827959060669, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1459.2344970703125, + "completions/mean_terminated_length": 930.3432006835938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.005114272015342816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10436240015138523, + "kl": 0.0006122589111328125, + "learning_rate": 4.893617021276596e-08, + "loss": 0.0704, + "num_tokens": 17603042.0, + "reward": 0.5412946939468384, + "reward_std": 0.4264402985572815, + "rewards/accuracy_reward/mean": 0.3958333432674408, + "rewards/accuracy_reward/std": 0.4895959198474884, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1595982164144516, + "rewards/tag_count_reward/std": 0.19408898055553436, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1578.3148193359375, + "completions/mean_terminated_length": 817.4795532226562, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.0053273666826487666, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09628061315971112, + "kl": 0.0006122589111328125, + "learning_rate": 5.106382978723404e-08, + "loss": 0.0914, + "num_tokens": 18378399.0, + "reward": 0.4207589328289032, + "reward_std": 0.2939138412475586, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45739173889160156, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1238839253783226, + "rewards/tag_count_reward/std": 0.18470267951488495, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1626.9888916015625, + "completions/mean_terminated_length": 994.2960815429688, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.005540461349954717, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09993865678887126, + "kl": 0.0006809234619140625, + "learning_rate": 5.3191489361702123e-08, + "loss": 0.0513, + "num_tokens": 19174554.0, + "reward": 0.4988839626312256, + "reward_std": 0.3830823004245758, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1194196417927742, + "rewards/tag_count_reward/std": 0.16029927134513855, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1538.794677734375, + "completions/mean_terminated_length": 884.10205078125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.005753556017260668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10245408847716521, + "kl": 0.0005316734313964844, + "learning_rate": 5.531914893617021e-08, + "loss": 0.0669, + "num_tokens": 19932558.0, + "reward": 0.4469866156578064, + "reward_std": 0.3912772238254547, + "rewards/accuracy_reward/mean": 0.3169642984867096, + "rewards/accuracy_reward/std": 0.4658135175704956, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1300223171710968, + "rewards/tag_count_reward/std": 0.18912668526172638, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1617.4754638671875, + "completions/mean_terminated_length": 952.1193237304688, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.005966650684566619, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.08597152328544956, + "kl": 0.0005950927734375, + "learning_rate": 5.7446808510638295e-08, + "loss": 0.0604, + "num_tokens": 20728131.0, + "reward": 0.4296875298023224, + "reward_std": 0.3333337903022766, + "rewards/accuracy_reward/mean": 0.3102678656578064, + "rewards/accuracy_reward/std": 0.46312037110328674, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1194196417927742, + "rewards/tag_count_reward/std": 0.187626451253891, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1547.743408203125, + "completions/mean_terminated_length": 874.623046875, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.00617974535187257, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.3009094483530404, + "kl": 0.0006208419799804688, + "learning_rate": 5.957446808510638e-08, + "loss": 0.082, + "num_tokens": 21497520.0, + "reward": 0.431919664144516, + "reward_std": 0.34356507658958435, + "rewards/accuracy_reward/mean": 0.3035714328289032, + "rewards/accuracy_reward/std": 0.46031373739242554, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1283482164144516, + "rewards/tag_count_reward/std": 0.1999439299106598, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1552.841552734375, + "completions/mean_terminated_length": 976.3526611328125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.00639284001917852, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11999954723943836, + "kl": 0.000629425048828125, + "learning_rate": 6.170212765957446e-08, + "loss": 0.0996, + "num_tokens": 22260553.0, + "reward": 0.5078125, + "reward_std": 0.3757893443107605, + "rewards/accuracy_reward/mean": 0.38657405972480774, + "rewards/accuracy_reward/std": 0.4875292479991913, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1350446492433548, + "rewards/tag_count_reward/std": 0.1782640665769577, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1555.6273193359375, + "completions/mean_terminated_length": 868.4118041992188, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.006605934686484471, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09875543353978837, + "kl": 0.000629425048828125, + "learning_rate": 6.382978723404254e-08, + "loss": 0.1085, + "num_tokens": 23034354.0, + "reward": 0.420200914144516, + "reward_std": 0.40927058458328247, + "rewards/accuracy_reward/mean": 0.2991071343421936, + "rewards/accuracy_reward/std": 0.45837873220443726, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.12109375, + "rewards/tag_count_reward/std": 0.1800643801689148, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1567.5826416015625, + "completions/mean_terminated_length": 921.1571044921875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.006819029353790421, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09796451753719258, + "kl": 0.0006122589111328125, + "learning_rate": 6.595744680851063e-08, + "loss": 0.1041, + "num_tokens": 23808775.0, + "reward": 0.4626116156578064, + "reward_std": 0.36975887417793274, + "rewards/accuracy_reward/mean": 0.35648149251937866, + "rewards/accuracy_reward/std": 0.47951504588127136, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1188616082072258, + "rewards/tag_count_reward/std": 0.17205896973609924, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1345.7857666015625, + "completions/mean_terminated_length": 842.6666259765625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.007032124021096372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12196101026133285, + "kl": 0.000682830810546875, + "learning_rate": 6.808510638297873e-08, + "loss": 0.0889, + "num_tokens": 24475687.0, + "reward": 0.6467634439468384, + "reward_std": 0.4578225910663605, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.16015625, + "rewards/tag_count_reward/std": 0.1836206167936325, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1346.9866943359375, + "completions/mean_terminated_length": 658.3805541992188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0072452186884023224, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10745798801090463, + "kl": 0.0006198883056640625, + "learning_rate": 7.02127659574468e-08, + "loss": 0.0601, + "num_tokens": 25150641.0, + "reward": 0.5072544813156128, + "reward_std": 0.3134019076824188, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1233258917927742, + "rewards/tag_count_reward/std": 0.16721661388874054, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1566.732177734375, + "completions/mean_terminated_length": 888.8171997070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.007458313355708273, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12964865572439357, + "kl": 0.0006628036499023438, + "learning_rate": 7.234042553191488e-08, + "loss": 0.0827, + "num_tokens": 25920777.0, + "reward": 0.3839285969734192, + "reward_std": 0.3860481381416321, + "rewards/accuracy_reward/mean": 0.2611607015132904, + "rewards/accuracy_reward/std": 0.43975841999053955, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1227678582072258, + "rewards/tag_count_reward/std": 0.1800929754972458, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1405.2188720703125, + "completions/mean_terminated_length": 891.510009765625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.007671408023014224, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1151012199106129, + "kl": 0.0005998611450195312, + "learning_rate": 7.446808510638298e-08, + "loss": 0.0638, + "num_tokens": 26628475.0, + "reward": 0.6545759439468384, + "reward_std": 0.3889673948287964, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1545758992433548, + "rewards/tag_count_reward/std": 0.1728697121143341, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1378.7232666015625, + "completions/mean_terminated_length": 961.6376953125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.007884502690320174, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10097242288441606, + "kl": 0.0005893707275390625, + "learning_rate": 7.659574468085106e-08, + "loss": 0.0848, + "num_tokens": 27312959.0, + "reward": 0.6194196939468384, + "reward_std": 0.3824792504310608, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1573660671710968, + "rewards/tag_count_reward/std": 0.18260905146598816, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1446.9710693359375, + "completions/mean_terminated_length": 856.5796508789062, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.008097597357626125, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.30447813766076604, + "kl": 0.000576019287109375, + "learning_rate": 7.872340425531915e-08, + "loss": 0.0205, + "num_tokens": 28031314.0, + "reward": 0.5111607313156128, + "reward_std": 0.36669179797172546, + "rewards/accuracy_reward/mean": 0.3638392984867096, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1473214328289032, + "rewards/tag_count_reward/std": 0.19082291424274445, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1713.3751220703125, + "completions/mean_terminated_length": 1116.86962890625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.008310692024932075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10262973183741655, + "kl": 0.0006504058837890625, + "learning_rate": 8.085106382978724e-08, + "loss": 0.0872, + "num_tokens": 28871850.0, + "reward": 0.4151785969734192, + "reward_std": 0.38343989849090576, + "rewards/accuracy_reward/mean": 0.2879464328289032, + "rewards/accuracy_reward/std": 0.4533122181892395, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1272321492433548, + "rewards/tag_count_reward/std": 0.17215418815612793, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1496.3929443359375, + "completions/mean_terminated_length": 934.8468627929688, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.008523786692238028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8394464547555742, + "kl": 0.0006561279296875, + "learning_rate": 8.297872340425531e-08, + "loss": 0.0913, + "num_tokens": 29614746.0, + "reward": 0.5111607313156128, + "reward_std": 0.4580632746219635, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1294642835855484, + "rewards/tag_count_reward/std": 0.17612579464912415, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1538.446533203125, + "completions/mean_terminated_length": 807.3478393554688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.008736881359543978, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.08706024487907357, + "kl": 0.0006227493286132812, + "learning_rate": 8.51063829787234e-08, + "loss": 0.0651, + "num_tokens": 30370306.0, + "reward": 0.4648437798023224, + "reward_std": 0.24940386414527893, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399619221687317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1255580335855484, + "rewards/tag_count_reward/std": 0.16554337739944458, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1661.513427734375, + "completions/mean_terminated_length": 945.1592407226562, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.008949976026849929, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.09806317826615497, + "kl": 0.0006170272827148438, + "learning_rate": 8.723404255319149e-08, + "loss": 0.0629, + "num_tokens": 31193624.0, + "reward": 0.4073660969734192, + "reward_std": 0.2731107175350189, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.46403056383132935, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0948660746216774, + "rewards/tag_count_reward/std": 0.14553913474082947, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1494.930908203125, + "completions/mean_terminated_length": 956.4801635742188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.00916307069415588, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1.0128460757898141, + "kl": 0.000698089599609375, + "learning_rate": 8.936170212765957e-08, + "loss": 0.0503, + "num_tokens": 31932217.0, + "reward": 0.5189732313156128, + "reward_std": 0.35851016640663147, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1462053507566452, + "rewards/tag_count_reward/std": 0.17885133624076843, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1457.99560546875, + "completions/mean_terminated_length": 867.9910888671875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.00937616536146183, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11776272944133993, + "kl": 0.0006351470947265625, + "learning_rate": 9.148936170212765e-08, + "loss": 0.1127, + "num_tokens": 32652839.0, + "reward": 0.5340402126312256, + "reward_std": 0.43986910581588745, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1545758992433548, + "rewards/tag_count_reward/std": 0.18686223030090332, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1485.26123046875, + "completions/mean_terminated_length": 912.3829345703125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.00958926002876778, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1035863264780186, + "kl": 0.00067138671875, + "learning_rate": 9.361702127659573e-08, + "loss": 0.0447, + "num_tokens": 33387820.0, + "reward": 0.4587053656578064, + "reward_std": 0.36645394563674927, + "rewards/accuracy_reward/mean": 0.3370535671710968, + "rewards/accuracy_reward/std": 0.47323182225227356, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1216517835855484, + "rewards/tag_count_reward/std": 0.16802562773227692, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1382.1876220703125, + "completions/mean_terminated_length": 887.3618774414062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.009802354696073731, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11381146694180108, + "kl": 0.0007276535034179688, + "learning_rate": 9.574468085106382e-08, + "loss": 0.0277, + "num_tokens": 34072976.0, + "reward": 0.5853794813156128, + "reward_std": 0.41184261441230774, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1568080335855484, + "rewards/tag_count_reward/std": 0.18423150479793549, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 1514.1473388671875, + "completions/mean_terminated_length": 881.3366088867188, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.010015449363379681, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09565738552322595, + "kl": 0.0006103515625, + "learning_rate": 9.787234042553192e-08, + "loss": 0.1223, + "num_tokens": 34824370.0, + "reward": 0.4804687798023224, + "reward_std": 0.360610693693161, + "rewards/accuracy_reward/mean": 0.35185185074806213, + "rewards/accuracy_reward/std": 0.4781017303466797, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1411830335855484, + "rewards/tag_count_reward/std": 0.19434207677841187, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1428.7054443359375, + "completions/mean_terminated_length": 792.5972900390625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.010228544030685632, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.2438327554885991, + "kl": 0.0006103515625, + "learning_rate": 1e-07, + "loss": 0.0925, + "num_tokens": 35537726.0, + "reward": 0.4614955484867096, + "reward_std": 0.33448395133018494, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.470055490732193, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1333705335855484, + "rewards/tag_count_reward/std": 0.18375654518604279, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1458.5804443359375, + "completions/mean_terminated_length": 879.5928955078125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.010441638697991583, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.0977685531678709, + "kl": 0.0006132125854492188, + "learning_rate": 1.0212765957446807e-07, + "loss": 0.0798, + "num_tokens": 36260242.0, + "reward": 0.5602678656578064, + "reward_std": 0.3839297592639923, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1383928507566452, + "rewards/tag_count_reward/std": 0.17164580523967743, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1551.4442138671875, + "completions/mean_terminated_length": 978.4952392578125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.010654733365297533, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.4952207656777053, + "kl": 0.0005893707275390625, + "learning_rate": 1.0425531914893617e-07, + "loss": 0.0671, + "num_tokens": 37026313.0, + "reward": 0.4910714626312256, + "reward_std": 0.3419077694416046, + "rewards/accuracy_reward/mean": 0.3638392984867096, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1272321492433548, + "rewards/tag_count_reward/std": 0.16637177765369415, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1255.325927734375, + "completions/mean_terminated_length": 779.721435546875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.010867828032603484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1412960599140765, + "kl": 0.0006103515625, + "learning_rate": 1.0638297872340425e-07, + "loss": 0.0137, + "num_tokens": 37653003.0, + "reward": 0.5574777126312256, + "reward_std": 0.40723317861557007, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1646205335855484, + "rewards/tag_count_reward/std": 0.1945667564868927, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1410.7991943359375, + "completions/mean_terminated_length": 843.5020751953125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.011080922699909434, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11296164846364418, + "kl": 0.0006666183471679688, + "learning_rate": 1.0851063829787234e-07, + "loss": 0.0584, + "num_tokens": 38347265.0, + "reward": 0.5652902126312256, + "reward_std": 0.3664935827255249, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.13671875, + "rewards/tag_count_reward/std": 0.1684810072183609, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1343.493408203125, + "completions/mean_terminated_length": 883.354248046875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.011294017367215385, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1357720870845481, + "kl": 0.0006160736083984375, + "learning_rate": 1.1063829787234042e-07, + "loss": 0.1081, + "num_tokens": 39007150.0, + "reward": 0.6556919813156128, + "reward_std": 0.40325039625167847, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1668526828289032, + "rewards/tag_count_reward/std": 0.17356158792972565, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1412.3817138671875, + "completions/mean_terminated_length": 836.26806640625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.011507112034521335, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10810936118624642, + "kl": 0.00066375732421875, + "learning_rate": 1.127659574468085e-07, + "loss": 0.0865, + "num_tokens": 39704041.0, + "reward": 0.52734375, + "reward_std": 0.32285118103027344, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1434151828289032, + "rewards/tag_count_reward/std": 0.17759311199188232, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1373.024658203125, + "completions/mean_terminated_length": 889.4214477539062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.011720206701827288, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12102843929228482, + "kl": 0.0006570816040039062, + "learning_rate": 1.1489361702127659e-07, + "loss": 0.0903, + "num_tokens": 40391124.0, + "reward": 0.6618303656578064, + "reward_std": 0.38166266679763794, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1529017835855484, + "rewards/tag_count_reward/std": 0.16740036010742188, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1613.6629638671875, + "completions/mean_terminated_length": 1001.8548583984375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.011933301369133238, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15827625867598152, + "kl": 0.00077056884765625, + "learning_rate": 1.1702127659574468e-07, + "loss": 0.0564, + "num_tokens": 41188589.0, + "reward": 0.4810267984867096, + "reward_std": 0.4002097249031067, + "rewards/accuracy_reward/mean": 0.3660714328289032, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1149553582072258, + "rewards/tag_count_reward/std": 0.15654832124710083, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1444.0535888671875, + "completions/mean_terminated_length": 891.7265625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.012146396036439189, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.34802836790378183, + "kl": 0.000701904296875, + "learning_rate": 1.1914893617021276e-07, + "loss": 0.0279, + "num_tokens": 41907045.0, + "reward": 0.5736607313156128, + "reward_std": 0.3971804976463318, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.140625, + "rewards/tag_count_reward/std": 0.15963202714920044, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1524.138427734375, + "completions/mean_terminated_length": 868.6532592773438, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.01235949070374514, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1009940678802228, + "kl": 0.00066375732421875, + "learning_rate": 1.2127659574468084e-07, + "loss": 0.078, + "num_tokens": 42659059.0, + "reward": 0.3900669813156128, + "reward_std": 0.3522178530693054, + "rewards/accuracy_reward/mean": 0.2566964328289032, + "rewards/accuracy_reward/std": 0.4372987151145935, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1333705335855484, + "rewards/tag_count_reward/std": 0.18451586365699768, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1516.0491943359375, + "completions/mean_terminated_length": 832.1122436523438, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.01257258537105109, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10333814503815872, + "kl": 0.0006532669067382812, + "learning_rate": 1.2340425531914892e-07, + "loss": 0.0765, + "num_tokens": 43409465.0, + "reward": 0.3671875298023224, + "reward_std": 0.2861935496330261, + "rewards/accuracy_reward/mean": 0.2455357164144516, + "rewards/accuracy_reward/std": 0.4308854937553406, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1216517835855484, + "rewards/tag_count_reward/std": 0.1792975217103958, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1369.654052734375, + "completions/mean_terminated_length": 827.5220336914062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.01278568003835704, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10919075700279716, + "kl": 0.0006704330444335938, + "learning_rate": 1.25531914893617e-07, + "loss": 0.0528, + "num_tokens": 44090958.0, + "reward": 0.5178571939468384, + "reward_std": 0.39728257060050964, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1339285671710968, + "rewards/tag_count_reward/std": 0.17355531454086304, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1584.8035888671875, + "completions/mean_terminated_length": 901.52490234375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.012998774705662991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10452449253069136, + "kl": 0.0005750656127929688, + "learning_rate": 1.2765957446808508e-07, + "loss": 0.0925, + "num_tokens": 44871766.0, + "reward": 0.3822544813156128, + "reward_std": 0.3722384572029114, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.43349677324295044, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1322544664144516, + "rewards/tag_count_reward/std": 0.19702155888080597, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1499.5335693359375, + "completions/mean_terminated_length": 807.0252685546875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.013211869372968942, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10269737040638459, + "kl": 0.0006465911865234375, + "learning_rate": 1.2978723404255319e-07, + "loss": 0.0809, + "num_tokens": 45616389.0, + "reward": 0.4665178656578064, + "reward_std": 0.3160642385482788, + "rewards/accuracy_reward/mean": 0.3526785671710968, + "rewards/accuracy_reward/std": 0.4783378541469574, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1138392835855484, + "rewards/tag_count_reward/std": 0.16601121425628662, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1543.1629638671875, + "completions/mean_terminated_length": 986.18310546875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.013424964040274892, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14117659955961492, + "kl": 0.0006532669067382812, + "learning_rate": 1.3191489361702127e-07, + "loss": 0.0802, + "num_tokens": 46379694.0, + "reward": 0.4854910969734192, + "reward_std": 0.35438334941864014, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1283482164144516, + "rewards/tag_count_reward/std": 0.15772415697574615, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1556.7210693359375, + "completions/mean_terminated_length": 942.0050048828125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.013638058707580843, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10516515272337294, + "kl": 0.0006771087646484375, + "learning_rate": 1.3404255319148934e-07, + "loss": 0.0218, + "num_tokens": 47145953.0, + "reward": 0.4017857313156128, + "reward_std": 0.3413859009742737, + "rewards/accuracy_reward/mean": 0.2790178656578064, + "rewards/accuracy_reward/std": 0.449017733335495, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1227678582072258, + "rewards/tag_count_reward/std": 0.16721008718013763, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1427.5001220703125, + "completions/mean_terminated_length": 894.5394897460938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.013851153374886793, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14553769231051605, + "kl": 0.0005903244018554688, + "learning_rate": 1.3617021276595745e-07, + "loss": 0.0675, + "num_tokens": 47849697.0, + "reward": 0.5027902126312256, + "reward_std": 0.3909113109111786, + "rewards/accuracy_reward/mean": 0.3660714328289032, + "rewards/accuracy_reward/std": 0.4822677969932556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.13671875, + "rewards/tag_count_reward/std": 0.19394339621067047, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1459.2344970703125, + "completions/mean_terminated_length": 966.9876708984375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.014064248042192744, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10345709313267537, + "kl": 0.0006418228149414062, + "learning_rate": 1.3829787234042553e-07, + "loss": 0.0603, + "num_tokens": 48568570.0, + "reward": 0.5334821939468384, + "reward_std": 0.3349814713001251, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1428571492433548, + "rewards/tag_count_reward/std": 0.16372442245483398, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1437.435302734375, + "completions/mean_terminated_length": 922.3497924804688, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.014277342709498694, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1616650053839849, + "kl": 0.0005855560302734375, + "learning_rate": 1.404255319148936e-07, + "loss": 0.0946, + "num_tokens": 49271421.0, + "reward": 0.5719866156578064, + "reward_std": 0.34255507588386536, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1568080335855484, + "rewards/tag_count_reward/std": 0.19671083986759186, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1536.024658203125, + "completions/mean_terminated_length": 895.4120483398438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.014490437376804645, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.23135387511604344, + "kl": 0.0006227493286132812, + "learning_rate": 1.4255319148936172e-07, + "loss": 0.0724, + "num_tokens": 50038696.0, + "reward": 0.4469866156578064, + "reward_std": 0.3783811330795288, + "rewards/accuracy_reward/mean": 0.3258928656578064, + "rewards/accuracy_reward/std": 0.46923142671585083, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.12109375, + "rewards/tag_count_reward/std": 0.17293468117713928, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1371.52685546875, + "completions/mean_terminated_length": 891.282470703125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.014703532044110595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10727099539961585, + "kl": 0.00057220458984375, + "learning_rate": 1.4468085106382977e-07, + "loss": 0.0731, + "num_tokens": 50723620.0, + "reward": 0.5245535969734192, + "reward_std": 0.3711250424385071, + "rewards/accuracy_reward/mean": 0.3616071343421936, + "rewards/accuracy_reward/std": 0.4810029864311218, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1629464328289032, + "rewards/tag_count_reward/std": 0.19346082210540771, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1441.6451416015625, + "completions/mean_terminated_length": 901.81005859375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.014916626711416546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10465712838302302, + "kl": 0.0006055831909179688, + "learning_rate": 1.4680851063829785e-07, + "loss": 0.0462, + "num_tokens": 51430517.0, + "reward": 0.5831473469734192, + "reward_std": 0.310307115316391, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1411830335855484, + "rewards/tag_count_reward/std": 0.157814159989357, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1391.1094970703125, + "completions/mean_terminated_length": 831.93798828125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.015129721378722498, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1106227567249354, + "kl": 0.0005865097045898438, + "learning_rate": 1.4893617021276595e-07, + "loss": 0.0918, + "num_tokens": 52123830.0, + "reward": 0.5452009439468384, + "reward_std": 0.3809743821620941, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1501116007566452, + "rewards/tag_count_reward/std": 0.182987242937088, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1392.232177734375, + "completions/mean_terminated_length": 991.2230224609375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.015342816046028449, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10862713855052201, + "kl": 0.0006055831909179688, + "learning_rate": 1.5106382978723403e-07, + "loss": 0.033, + "num_tokens": 52812910.0, + "reward": 0.6010044813156128, + "reward_std": 0.35854873061180115, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1568080335855484, + "rewards/tag_count_reward/std": 0.1700226068496704, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1403.3326416015625, + "completions/mean_terminated_length": 937.1884765625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.0155559107133344, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11132467714117722, + "kl": 0.0006723403930664062, + "learning_rate": 1.531914893617021e-07, + "loss": 0.0977, + "num_tokens": 53513347.0, + "reward": 0.641183078289032, + "reward_std": 0.371239572763443, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1657366007566452, + "rewards/tag_count_reward/std": 0.18092206120491028, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1589.3192138671875, + "completions/mean_terminated_length": 912.7017211914062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.015769005380640348, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09876810544299988, + "kl": 0.0006694793701171875, + "learning_rate": 1.5531914893617022e-07, + "loss": 0.0526, + "num_tokens": 54292754.0, + "reward": 0.459263414144516, + "reward_std": 0.4004373848438263, + "rewards/accuracy_reward/mean": 0.3348214328289032, + "rewards/accuracy_reward/std": 0.47245556116104126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1244419664144516, + "rewards/tag_count_reward/std": 0.18242010474205017, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1407.1920166015625, + "completions/mean_terminated_length": 930.9494018554688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0159821000479463, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13130369092024977, + "kl": 0.0006513595581054688, + "learning_rate": 1.574468085106383e-07, + "loss": 0.0654, + "num_tokens": 54988840.0, + "reward": 0.5859375, + "reward_std": 0.33688119053840637, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1529017835855484, + "rewards/tag_count_reward/std": 0.17555426061153412, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1507.524658203125, + "completions/mean_terminated_length": 855.2265625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.01619519471525225, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10490041011979272, + "kl": 0.0006437301635742188, + "learning_rate": 1.5957446808510638e-07, + "loss": 0.0503, + "num_tokens": 55730531.0, + "reward": 0.504464328289032, + "reward_std": 0.33776572346687317, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1495535671710968, + "rewards/tag_count_reward/std": 0.21339333057403564, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1523.560302734375, + "completions/mean_terminated_length": 923.842041015625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.0164082893825582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10986942428119216, + "kl": 0.0005559921264648438, + "learning_rate": 1.6170212765957448e-07, + "loss": 0.0985, + "num_tokens": 56480542.0, + "reward": 0.5463169813156128, + "reward_std": 0.3517903685569763, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1378348171710968, + "rewards/tag_count_reward/std": 0.18652118742465973, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1352.712158203125, + "completions/mean_terminated_length": 970.1834106445312, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.01662138404986415, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11358275964744895, + "kl": 0.0005674362182617188, + "learning_rate": 1.6382978723404254e-07, + "loss": 0.1028, + "num_tokens": 57151981.0, + "reward": 0.5931919813156128, + "reward_std": 0.3596561551094055, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1690848171710968, + "rewards/tag_count_reward/std": 0.17935582995414734, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1519.7723388671875, + "completions/mean_terminated_length": 904.7825927734375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.016834478717170104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10108913185978581, + "kl": 0.0005426406860351562, + "learning_rate": 1.6595744680851062e-07, + "loss": 0.1016, + "num_tokens": 57900567.0, + "reward": 0.5290178656578064, + "reward_std": 0.3699549734592438, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1428571492433548, + "rewards/tag_count_reward/std": 0.17686142027378082, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1466.3035888671875, + "completions/mean_terminated_length": 943.7626953125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.017047573384476055, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.27266530804882455, + "kl": 0.000659942626953125, + "learning_rate": 1.6808510638297872e-07, + "loss": 0.0881, + "num_tokens": 58624063.0, + "reward": 0.4508928656578064, + "reward_std": 0.3456571400165558, + "rewards/accuracy_reward/mean": 0.3102678656578064, + "rewards/accuracy_reward/std": 0.46312034130096436, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.140625, + "rewards/tag_count_reward/std": 0.18555572628974915, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1604.509033203125, + "completions/mean_terminated_length": 1029.107666015625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.017260668051782006, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.27194617401617815, + "kl": 0.0006427764892578125, + "learning_rate": 1.702127659574468e-07, + "loss": 0.0498, + "num_tokens": 59416195.0, + "reward": 0.5368303656578064, + "reward_std": 0.37965935468673706, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1238839253783226, + "rewards/tag_count_reward/std": 0.16039270162582397, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1410.837158203125, + "completions/mean_terminated_length": 915.2659301757812, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.017473762719087956, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12917745958808036, + "kl": 0.00067138671875, + "learning_rate": 1.7234042553191488e-07, + "loss": 0.0966, + "num_tokens": 60120074.0, + "reward": 0.621651828289032, + "reward_std": 0.3663535714149475, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1551339328289032, + "rewards/tag_count_reward/std": 0.17598041892051697, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1607.3795166015625, + "completions/mean_terminated_length": 1025.21240234375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.017686857386393907, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12240175494287042, + "kl": 0.0006380081176757812, + "learning_rate": 1.7446808510638299e-07, + "loss": 0.0692, + "num_tokens": 60908196.0, + "reward": 0.4079241156578064, + "reward_std": 0.29820477962493896, + "rewards/accuracy_reward/mean": 0.2879464328289032, + "rewards/accuracy_reward/std": 0.4533121883869171, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1199776753783226, + "rewards/tag_count_reward/std": 0.15767961740493774, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1484.4844970703125, + "completions/mean_terminated_length": 879.2268676757812, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.017899952053699857, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09501206879675293, + "kl": 0.0005626678466796875, + "learning_rate": 1.7659574468085106e-07, + "loss": 0.1089, + "num_tokens": 61636637.0, + "reward": 0.459263414144516, + "reward_std": 0.40043362975120544, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1199776753783226, + "rewards/tag_count_reward/std": 0.18311679363250732, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1551.47998046875, + "completions/mean_terminated_length": 871.0634765625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.018113046721005808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1024884959406532, + "kl": 0.0006132125854492188, + "learning_rate": 1.7872340425531914e-07, + "loss": 0.1096, + "num_tokens": 62402452.0, + "reward": 0.4324777126312256, + "reward_std": 0.3555998206138611, + "rewards/accuracy_reward/mean": 0.3035714328289032, + "rewards/accuracy_reward/std": 0.46031373739242554, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.12890625, + "rewards/tag_count_reward/std": 0.16966979205608368, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1443.3817138671875, + "completions/mean_terminated_length": 919.3792114257812, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.01832614138831176, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10653421298342539, + "kl": 0.0006618499755859375, + "learning_rate": 1.8085106382978725e-07, + "loss": 0.0725, + "num_tokens": 63116687.0, + "reward": 0.6194196939468384, + "reward_std": 0.3753754496574402, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1506696492433548, + "rewards/tag_count_reward/std": 0.1718747764825821, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1498.6785888671875, + "completions/mean_terminated_length": 959.0796508789062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.01853923605561771, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0995652245577469, + "kl": 0.0005559921264648438, + "learning_rate": 1.829787234042553e-07, + "loss": 0.1143, + "num_tokens": 63859839.0, + "reward": 0.4760044813156128, + "reward_std": 0.4069821238517761, + "rewards/accuracy_reward/mean": 0.3303571343421936, + "rewards/accuracy_reward/std": 0.4708675146102905, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1456473171710968, + "rewards/tag_count_reward/std": 0.17969655990600586, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1625.6295166015625, + "completions/mean_terminated_length": 1101.8900146484375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.01875233072292366, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09725143224765981, + "kl": 0.0006198883056640625, + "learning_rate": 1.8510638297872338e-07, + "loss": 0.0878, + "num_tokens": 64654553.0, + "reward": 0.5206473469734192, + "reward_std": 0.332084059715271, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1434151828289032, + "rewards/tag_count_reward/std": 0.17117878794670105, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1465.9754638671875, + "completions/mean_terminated_length": 904.372802734375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.01896542539022961, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.09620892501706234, + "kl": 0.0006999969482421875, + "learning_rate": 1.8723404255319146e-07, + "loss": 0.056, + "num_tokens": 65378878.0, + "reward": 0.5089285969734192, + "reward_std": 0.3440691828727722, + "rewards/accuracy_reward/mean": 0.38657405972480774, + "rewards/accuracy_reward/std": 0.4875292479991913, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1361607164144516, + "rewards/tag_count_reward/std": 0.16431809961795807, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1402.18310546875, + "completions/mean_terminated_length": 857.3579711914062, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.01917852005753556, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10804353703288051, + "kl": 0.0005998611450195312, + "learning_rate": 1.8936170212765957e-07, + "loss": 0.1313, + "num_tokens": 66075712.0, + "reward": 0.59765625, + "reward_std": 0.33677539229393005, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1534598171710968, + "rewards/tag_count_reward/std": 0.1722475290298462, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1543.1629638671875, + "completions/mean_terminated_length": 917.1649780273438, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.01939161472484151, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09515397637834716, + "kl": 0.000644683837890625, + "learning_rate": 1.9148936170212765e-07, + "loss": 0.0675, + "num_tokens": 66839065.0, + "reward": 0.420200914144516, + "reward_std": 0.3352234661579132, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.45011183619499207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1389508992433548, + "rewards/tag_count_reward/std": 0.19234690070152283, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1482.5848388671875, + "completions/mean_terminated_length": 917.169677734375, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.019604709392147462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1068141667938709, + "kl": 0.0005855560302734375, + "learning_rate": 1.9361702127659573e-07, + "loss": 0.0812, + "num_tokens": 67571631.0, + "reward": 0.5496652126312256, + "reward_std": 0.3237675428390503, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1456473171710968, + "rewards/tag_count_reward/std": 0.1789167821407318, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1530.540283203125, + "completions/mean_terminated_length": 984.5963134765625, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.019817804059453412, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09780008866627878, + "kl": 0.0006361007690429688, + "learning_rate": 1.9574468085106383e-07, + "loss": 0.0536, + "num_tokens": 68336577.0, + "reward": 0.4927455484867096, + "reward_std": 0.34423699975013733, + "rewards/accuracy_reward/mean": 0.3638392984867096, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.12890625, + "rewards/tag_count_reward/std": 0.16034892201423645, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1468.466552734375, + "completions/mean_terminated_length": 961.6777954101562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.020030898726759363, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10978206593533296, + "kl": 0.0005826950073242188, + "learning_rate": 1.978723404255319e-07, + "loss": 0.0805, + "num_tokens": 69068178.0, + "reward": 0.555245578289032, + "reward_std": 0.3736456334590912, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1400669664144516, + "rewards/tag_count_reward/std": 0.16821405291557312, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1644.435302734375, + "completions/mean_terminated_length": 1008.936767578125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.020243993394065313, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1545924523125092, + "kl": 0.0007457733154296875, + "learning_rate": 2e-07, + "loss": 0.0493, + "num_tokens": 69878165.0, + "reward": 0.3917410969734192, + "reward_std": 0.340265154838562, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44215917587280273, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1261160671710968, + "rewards/tag_count_reward/std": 0.18164943158626556, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1456.8907470703125, + "completions/mean_terminated_length": 949.17431640625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.020457088061371264, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.30280277963362745, + "kl": 0.0006666183471679688, + "learning_rate": 2.0212765957446807e-07, + "loss": 0.1121, + "num_tokens": 70603604.0, + "reward": 0.5251116156578064, + "reward_std": 0.38898965716362, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1501116007566452, + "rewards/tag_count_reward/std": 0.17834369838237762, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1396.743408203125, + "completions/mean_terminated_length": 975.3419189453125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.020670182728677215, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09653671528525909, + "kl": 0.0006580352783203125, + "learning_rate": 2.0425531914893615e-07, + "loss": 0.0432, + "num_tokens": 71300305.0, + "reward": 0.6110491156578064, + "reward_std": 0.3495582640171051, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1467633992433548, + "rewards/tag_count_reward/std": 0.16747771203517914, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1491.6116943359375, + "completions/mean_terminated_length": 940.1688842773438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.020883277395983165, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8778560965538851, + "kl": 0.0006694793701171875, + "learning_rate": 2.0638297872340423e-07, + "loss": 0.074, + "num_tokens": 72035571.0, + "reward": 0.5223214626312256, + "reward_std": 0.3277195692062378, + "rewards/accuracy_reward/mean": 0.3616071343421936, + "rewards/accuracy_reward/std": 0.4810029864311218, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1607142835855484, + "rewards/tag_count_reward/std": 0.19532322883605957, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1413.325927734375, + "completions/mean_terminated_length": 761.4208374023438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.021096372063289116, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11767497920125451, + "kl": 0.000736236572265625, + "learning_rate": 2.0851063829787233e-07, + "loss": 0.0494, + "num_tokens": 72731221.0, + "reward": 0.5178571939468384, + "reward_std": 0.34111085534095764, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1339285671710968, + "rewards/tag_count_reward/std": 0.1719365119934082, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1466.2098388671875, + "completions/mean_terminated_length": 884.419677734375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.021309466730595066, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1027935139762827, + "kl": 0.0005636215209960938, + "learning_rate": 2.1063829787234041e-07, + "loss": 0.0905, + "num_tokens": 73457123.0, + "reward": 0.4642857313156128, + "reward_std": 0.3221930265426636, + "rewards/accuracy_reward/mean": 0.3236607015132904, + "rewards/accuracy_reward/std": 0.46839529275894165, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.140625, + "rewards/tag_count_reward/std": 0.1754866987466812, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1335.977783203125, + "completions/mean_terminated_length": 951.8281860351562, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.021522561397901017, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.29290425659683816, + "kl": 0.000652313232421875, + "learning_rate": 2.127659574468085e-07, + "loss": 0.0692, + "num_tokens": 74112921.0, + "reward": 0.6774553656578064, + "reward_std": 0.36400285363197327, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1618303507566452, + "rewards/tag_count_reward/std": 0.1681741625070572, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1609.1407470703125, + "completions/mean_terminated_length": 911.5317993164062, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.021735656065206967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10055986763775289, + "kl": 0.0006399154663085938, + "learning_rate": 2.148936170212766e-07, + "loss": 0.0764, + "num_tokens": 74907400.0, + "reward": 0.3805803656578064, + "reward_std": 0.29630815982818604, + "rewards/accuracy_reward/mean": 0.2566964328289032, + "rewards/accuracy_reward/std": 0.4372987747192383, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1238839253783226, + "rewards/tag_count_reward/std": 0.169711172580719, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1511.0379638671875, + "completions/mean_terminated_length": 874.5414428710938, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.021948750732512918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2765706723648975, + "kl": 0.000591278076171875, + "learning_rate": 2.1702127659574468e-07, + "loss": 0.1189, + "num_tokens": 75653033.0, + "reward": 0.4765625298023224, + "reward_std": 0.370633989572525, + "rewards/accuracy_reward/mean": 0.3303571343421936, + "rewards/accuracy_reward/std": 0.47086748480796814, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1462053507566452, + "rewards/tag_count_reward/std": 0.19240935146808624, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1562.7523193359375, + "completions/mean_terminated_length": 1050.7935791015625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.02216184539981887, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.22219206919252477, + "kl": 0.0006437301635742188, + "learning_rate": 2.1914893617021276e-07, + "loss": 0.0435, + "num_tokens": 76422474.0, + "reward": 0.5775669813156128, + "reward_std": 0.4197610020637512, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1378348171710968, + "rewards/tag_count_reward/std": 0.16248351335525513, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1497.3126220703125, + "completions/mean_terminated_length": 832.6896362304688, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.02237494006712482, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.08405243328749457, + "kl": 0.00061798095703125, + "learning_rate": 2.2127659574468084e-07, + "loss": 0.0677, + "num_tokens": 77162566.0, + "reward": 0.4793527126312256, + "reward_std": 0.3031631410121918, + "rewards/accuracy_reward/mean": 0.3683035671710968, + "rewards/accuracy_reward/std": 0.4828835725784302, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1110491082072258, + "rewards/tag_count_reward/std": 0.1748300939798355, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1625.0157470703125, + "completions/mean_terminated_length": 1119.0931396484375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.02258803473443077, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09763429596861473, + "kl": 0.0006494522094726562, + "learning_rate": 2.2340425531914892e-07, + "loss": 0.0522, + "num_tokens": 77973837.0, + "reward": 0.504464328289032, + "reward_std": 0.40397918224334717, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1294642835855484, + "rewards/tag_count_reward/std": 0.15136271715164185, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1463.2723388671875, + "completions/mean_terminated_length": 938.0084838867188, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.02280112940173672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10807042921158809, + "kl": 0.0006504058837890625, + "learning_rate": 2.25531914893617e-07, + "loss": 0.0468, + "num_tokens": 78699927.0, + "reward": 0.5256696939468384, + "reward_std": 0.394024521112442, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.4852356016635895, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1484375, + "rewards/tag_count_reward/std": 0.18397125601768494, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1410.4398193359375, + "completions/mean_terminated_length": 872.5802001953125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.02301422406904267, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11091067162003142, + "kl": 0.0005054473876953125, + "learning_rate": 2.276595744680851e-07, + "loss": 0.0834, + "num_tokens": 79398924.0, + "reward": 0.570870578289032, + "reward_std": 0.35215818881988525, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1623883992433548, + "rewards/tag_count_reward/std": 0.17853958904743195, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1504.743408203125, + "completions/mean_terminated_length": 994.4112548828125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.02322731873634862, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09505077221753427, + "kl": 0.0005846023559570312, + "learning_rate": 2.2978723404255318e-07, + "loss": 0.06, + "num_tokens": 80137577.0, + "reward": 0.5831473469734192, + "reward_std": 0.39024558663368225, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1434151828289032, + "rewards/tag_count_reward/std": 0.1768040508031845, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1589.3126220703125, + "completions/mean_terminated_length": 1083.2489013671875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.023440413403654575, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09416312435280728, + "kl": 0.0006666183471679688, + "learning_rate": 2.3191489361702126e-07, + "loss": 0.068, + "num_tokens": 80924645.0, + "reward": 0.5418527126312256, + "reward_std": 0.37803032994270325, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1489955335855484, + "rewards/tag_count_reward/std": 0.18237218260765076, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1569.274658203125, + "completions/mean_terminated_length": 793.7953491210938, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.023653508070960526, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09445706322437208, + "kl": 0.0006265640258789062, + "learning_rate": 2.3404255319148937e-07, + "loss": 0.0926, + "num_tokens": 81707072.0, + "reward": 0.291294664144516, + "reward_std": 0.33396586775779724, + "rewards/accuracy_reward/mean": 0.1852678507566452, + "rewards/accuracy_reward/std": 0.38894903659820557, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1060267835855484, + "rewards/tag_count_reward/std": 0.16947561502456665, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1318.1629638671875, + "completions/mean_terminated_length": 713.4407958984375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.023866602738266476, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10026729129208353, + "kl": 0.0005350112915039062, + "learning_rate": 2.3617021276595745e-07, + "loss": 0.0808, + "num_tokens": 82364361.0, + "reward": 0.55078125, + "reward_std": 0.3883152902126312, + "rewards/accuracy_reward/mean": 0.4097222089767456, + "rewards/accuracy_reward/std": 0.49235257506370544, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1556919664144516, + "rewards/tag_count_reward/std": 0.19330507516860962, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1538.6429443359375, + "completions/mean_terminated_length": 859.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.024079697405572427, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.3987459032283757, + "kl": 0.0007219314575195312, + "learning_rate": 2.3829787234042553e-07, + "loss": 0.1144, + "num_tokens": 83123113.0, + "reward": 0.4140625298023224, + "reward_std": 0.3538362681865692, + "rewards/accuracy_reward/mean": 0.2834821343421936, + "rewards/accuracy_reward/std": 0.4511922299861908, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1305803507566452, + "rewards/tag_count_reward/std": 0.19349630177021027, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1538.2410888671875, + "completions/mean_terminated_length": 933.990234375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.024292792072878377, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11072220993669099, + "kl": 0.0007066726684570312, + "learning_rate": 2.404255319148936e-07, + "loss": 0.1026, + "num_tokens": 83885317.0, + "reward": 0.4910714626312256, + "reward_std": 0.3769613802433014, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1450892835855484, + "rewards/tag_count_reward/std": 0.18360786139965057, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1453.040283203125, + "completions/mean_terminated_length": 884.0611572265625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.024505886740184328, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10832449307063403, + "kl": 0.0007677078247070312, + "learning_rate": 2.425531914893617e-07, + "loss": 0.0841, + "num_tokens": 84601255.0, + "reward": 0.5675223469734192, + "reward_std": 0.3686137795448303, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1478794664144516, + "rewards/tag_count_reward/std": 0.1746872216463089, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1678.15185546875, + "completions/mean_terminated_length": 897.3611450195312, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.02471898140749028, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10803386197177677, + "kl": 0.00069427490234375, + "learning_rate": 2.4468085106382976e-07, + "loss": 0.0946, + "num_tokens": 85424011.0, + "reward": 0.322544664144516, + "reward_std": 0.28670406341552734, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4138607978820801, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1037946417927742, + "rewards/tag_count_reward/std": 0.16417749226093292, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1465.325927734375, + "completions/mean_terminated_length": 969.33056640625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.02493207607479623, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12501492248088594, + "kl": 0.00067901611328125, + "learning_rate": 2.4680851063829784e-07, + "loss": 0.0718, + "num_tokens": 86160477.0, + "reward": 0.547433078289032, + "reward_std": 0.4016090929508209, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.16796875, + "rewards/tag_count_reward/std": 0.18947620689868927, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1441.227783203125, + "completions/mean_terminated_length": 886.3162841796875, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.02514517074210218, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1035698561749098, + "kl": 0.0006999969482421875, + "learning_rate": 2.489361702127659e-07, + "loss": 0.0778, + "num_tokens": 86878835.0, + "reward": 0.4960937798023224, + "reward_std": 0.3432289958000183, + "rewards/accuracy_reward/mean": 0.3526785671710968, + "rewards/accuracy_reward/std": 0.4783378541469574, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1434151828289032, + "rewards/tag_count_reward/std": 0.1860520839691162, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1485.966552734375, + "completions/mean_terminated_length": 1032.713623046875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.02535826540940813, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12158033555966147, + "kl": 0.0007266998291015625, + "learning_rate": 2.51063829787234e-07, + "loss": 0.0667, + "num_tokens": 87610900.0, + "reward": 0.6183035969734192, + "reward_std": 0.34593039751052856, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401999473572, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1450892835855484, + "rewards/tag_count_reward/std": 0.16851899027824402, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1660.310302734375, + "completions/mean_terminated_length": 1109.1622314453125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.02557136007671408, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.08667027420443743, + "kl": 0.0006589889526367188, + "learning_rate": 2.531914893617021e-07, + "loss": 0.0783, + "num_tokens": 88430783.0, + "reward": 0.4860491156578064, + "reward_std": 0.3413888216018677, + "rewards/accuracy_reward/mean": 0.3325892984867096, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1534598171710968, + "rewards/tag_count_reward/std": 0.201433002948761, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1556.450927734375, + "completions/mean_terminated_length": 952.407958984375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.02578445474402003, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.40394700910506426, + "kl": 0.0006160736083984375, + "learning_rate": 2.5531914893617016e-07, + "loss": 0.121, + "num_tokens": 89201337.0, + "reward": 0.4598214626312256, + "reward_std": 0.36382588744163513, + "rewards/accuracy_reward/mean": 0.3214285671710968, + "rewards/accuracy_reward/std": 0.4675469994544983, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1383928507566452, + "rewards/tag_count_reward/std": 0.19669893383979797, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1499.9107666015625, + "completions/mean_terminated_length": 844.3529663085938, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.025997549411325982, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.21764557937746304, + "kl": 0.0005817413330078125, + "learning_rate": 2.574468085106383e-07, + "loss": 0.0724, + "num_tokens": 89940417.0, + "reward": 0.500558078289032, + "reward_std": 0.3537725806236267, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1411830335855484, + "rewards/tag_count_reward/std": 0.17302851378917694, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1537.9576416015625, + "completions/mean_terminated_length": 938.7815551757812, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.026210644078631933, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0998604700188912, + "kl": 0.0006914138793945312, + "learning_rate": 2.5957446808510637e-07, + "loss": 0.0948, + "num_tokens": 90706206.0, + "reward": 0.4224330484867096, + "reward_std": 0.38441112637519836, + "rewards/accuracy_reward/mean": 0.2790178656578064, + "rewards/accuracy_reward/std": 0.449017733335495, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1434151828289032, + "rewards/tag_count_reward/std": 0.18148697912693024, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1528.634033203125, + "completions/mean_terminated_length": 934.7176513671875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.026423738745937883, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.8214136488274478, + "kl": 0.0007534027099609375, + "learning_rate": 2.6170212765957445e-07, + "loss": 0.1294, + "num_tokens": 91453514.0, + "reward": 0.5412946939468384, + "reward_std": 0.3938583731651306, + "rewards/accuracy_reward/mean": 0.40046295523643494, + "rewards/accuracy_reward/std": 0.49056029319763184, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1551339328289032, + "rewards/tag_count_reward/std": 0.18974366784095764, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1619.35498046875, + "completions/mean_terminated_length": 975.18994140625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.026636833413243834, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09679444024126113, + "kl": 0.0006990432739257812, + "learning_rate": 2.6382978723404253e-07, + "loss": 0.0409, + "num_tokens": 92242329.0, + "reward": 0.396763414144516, + "reward_std": 0.31771764159202576, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.45011183619499207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1155133917927742, + "rewards/tag_count_reward/std": 0.16442348062992096, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1606.602783203125, + "completions/mean_terminated_length": 930.7909545898438, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.026849928080549784, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11909391566085942, + "kl": 0.0006961822509765625, + "learning_rate": 2.659574468085106e-07, + "loss": 0.0649, + "num_tokens": 93034135.0, + "reward": 0.4140625298023224, + "reward_std": 0.3400106132030487, + "rewards/accuracy_reward/mean": 0.2901785671710968, + "rewards/accuracy_reward/std": 0.4543520212173462, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1238839253783226, + "rewards/tag_count_reward/std": 0.16126208007335663, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1516.634033203125, + "completions/mean_terminated_length": 827.2205200195312, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.027063022747855735, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.0959436264047987, + "kl": 0.0006580352783203125, + "learning_rate": 2.680851063829787e-07, + "loss": 0.0871, + "num_tokens": 93788867.0, + "reward": 0.4285714626312256, + "reward_std": 0.39995086193084717, + "rewards/accuracy_reward/mean": 0.3058035671710968, + "rewards/accuracy_reward/std": 0.461262047290802, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1227678582072258, + "rewards/tag_count_reward/std": 0.16552923619747162, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1601.821533203125, + "completions/mean_terminated_length": 865.2307739257812, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.027276117415161685, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09881685227109833, + "kl": 0.0006542205810546875, + "learning_rate": 2.702127659574468e-07, + "loss": 0.116, + "num_tokens": 94576019.0, + "reward": 0.3710937798023224, + "reward_std": 0.3424123525619507, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.43349677324295044, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.12109375, + "rewards/tag_count_reward/std": 0.17613907158374786, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1371.2523193359375, + "completions/mean_terminated_length": 868.2996215820312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.027489212082467636, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12308943825185102, + "kl": 0.0006923675537109375, + "learning_rate": 2.723404255319149e-07, + "loss": 0.0807, + "num_tokens": 95257508.0, + "reward": 0.6233259439468384, + "reward_std": 0.39112555980682373, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1813616007566452, + "rewards/tag_count_reward/std": 0.19688211381435394, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1442.232177734375, + "completions/mean_terminated_length": 868.0695190429688, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.027702306749773586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10037850525564837, + "kl": 0.000583648681640625, + "learning_rate": 2.74468085106383e-07, + "loss": 0.109, + "num_tokens": 95967964.0, + "reward": 0.5206473469734192, + "reward_std": 0.3918469548225403, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.1434151828289032, + "rewards/tag_count_reward/std": 0.16953729093074799, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1480.88623046875, + "completions/mean_terminated_length": 802.5735473632812, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.027915401417079537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10545902003012357, + "kl": 0.0006513595581054688, + "learning_rate": 2.7659574468085106e-07, + "loss": 0.1391, + "num_tokens": 96700249.0, + "reward": 0.5078125, + "reward_std": 0.376852810382843, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1350446492433548, + "rewards/tag_count_reward/std": 0.16858935356140137, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1606.58935546875, + "completions/mean_terminated_length": 967.387939453125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.028128496084385488, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.5570782586541217, + "kl": 0.0007734298706054688, + "learning_rate": 2.7872340425531914e-07, + "loss": 0.0926, + "num_tokens": 97496401.0, + "reward": 0.3152901828289032, + "reward_std": 0.3668878674507141, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3907487094402313, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1277901828289032, + "rewards/tag_count_reward/std": 0.1721460223197937, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1222.1004638671875, + "completions/mean_terminated_length": 826.8680419921875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.028341590751691438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3014704904699505, + "kl": 0.0007181167602539062, + "learning_rate": 2.808510638297872e-07, + "loss": 0.1489, + "num_tokens": 98104638.0, + "reward": 0.6679688096046448, + "reward_std": 0.37830984592437744, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1746651828289032, + "rewards/tag_count_reward/std": 0.17229825258255005, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1480.9532470703125, + "completions/mean_terminated_length": 966.991455078125, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.02855468541899739, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0991900877404499, + "kl": 0.00064849853515625, + "learning_rate": 2.8297872340425535e-07, + "loss": 0.0876, + "num_tokens": 98839273.0, + "reward": 0.5306919813156128, + "reward_std": 0.3562542796134949, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1489955335855484, + "rewards/tag_count_reward/std": 0.1737341433763504, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1451.88623046875, + "completions/mean_terminated_length": 979.7640380859375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.02876778008630334, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11305584827542739, + "kl": 0.000728607177734375, + "learning_rate": 2.8510638297872343e-07, + "loss": 0.0675, + "num_tokens": 99555910.0, + "reward": 0.6707589626312256, + "reward_std": 0.41780179738998413, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1662946492433548, + "rewards/tag_count_reward/std": 0.17288866639137268, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1470.16748046875, + "completions/mean_terminated_length": 999.9473876953125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.02898087475360929, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10307618624591357, + "kl": 0.0007343292236328125, + "learning_rate": 2.872340425531915e-07, + "loss": 0.1195, + "num_tokens": 100281729.0, + "reward": 0.5686384439468384, + "reward_std": 0.3552924692630768, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.17578125, + "rewards/tag_count_reward/std": 0.18678203225135803, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1476.3795166015625, + "completions/mean_terminated_length": 851.33642578125, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.02919396942091524, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11018190902562928, + "kl": 0.0007200241088867188, + "learning_rate": 2.8936170212765954e-07, + "loss": 0.078, + "num_tokens": 101012091.0, + "reward": 0.4799107313156128, + "reward_std": 0.3507802188396454, + "rewards/accuracy_reward/mean": 0.3415178656578064, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1383928507566452, + "rewards/tag_count_reward/std": 0.1700088381767273, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1371.3907470703125, + "completions/mean_terminated_length": 949.7355346679688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.02940706408822119, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11145734046911678, + "kl": 0.0007352828979492188, + "learning_rate": 2.914893617021276e-07, + "loss": 0.0348, + "num_tokens": 101693306.0, + "reward": 0.6774553656578064, + "reward_std": 0.38402020931243896, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1841517835855484, + "rewards/tag_count_reward/std": 0.19390879571437836, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1589.919677734375, + "completions/mean_terminated_length": 944.6666870117188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.02962015875552714, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10102389689442862, + "kl": 0.0006847381591796875, + "learning_rate": 2.936170212765957e-07, + "loss": 0.0882, + "num_tokens": 102481718.0, + "reward": 0.4135044813156128, + "reward_std": 0.35761624574661255, + "rewards/accuracy_reward/mean": 0.2834821343421936, + "rewards/accuracy_reward/std": 0.4511922299861908, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1300223171710968, + "rewards/tag_count_reward/std": 0.1671494096517563, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1607.1607666015625, + "completions/mean_terminated_length": 1055.5577392578125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.029833253422833092, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13531349240595333, + "kl": 0.0006647109985351562, + "learning_rate": 2.957446808510638e-07, + "loss": 0.0794, + "num_tokens": 103270814.0, + "reward": 0.4375000298023224, + "reward_std": 0.3748447597026825, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45739173889160156, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.140625, + "rewards/tag_count_reward/std": 0.18328121304512024, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1502.3751220703125, + "completions/mean_terminated_length": 931.8355712890625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.030046348090139043, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10403643046517459, + "kl": 0.0007505416870117188, + "learning_rate": 2.978723404255319e-07, + "loss": 0.0955, + "num_tokens": 104012902.0, + "reward": 0.4955357313156128, + "reward_std": 0.3992388844490051, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1517857164144516, + "rewards/tag_count_reward/std": 0.17332497239112854, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1573.8304443359375, + "completions/mean_terminated_length": 1006.6863403320312, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.030259442757444997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12303349814944814, + "kl": 0.0007162094116210938, + "learning_rate": 3e-07, + "loss": 0.0961, + "num_tokens": 104791098.0, + "reward": 0.4771205484867096, + "reward_std": 0.3802442252635956, + "rewards/accuracy_reward/mean": 0.3191964328289032, + "rewards/accuracy_reward/std": 0.4666863977909088, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1579241007566452, + "rewards/tag_count_reward/std": 0.19438061118125916, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1450.3348388671875, + "completions/mean_terminated_length": 950.6474609375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.030472537424750947, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1059472942565495, + "kl": 0.0007429122924804688, + "learning_rate": 3.0212765957446807e-07, + "loss": 0.0753, + "num_tokens": 105511520.0, + "reward": 0.6356027126312256, + "reward_std": 0.3977864980697632, + "rewards/accuracy_reward/mean": 0.47685185074806213, + "rewards/accuracy_reward/std": 0.5000429749488831, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.17578125, + "rewards/tag_count_reward/std": 0.19839808344841003, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1413.8438720703125, + "completions/mean_terminated_length": 995.7703247070312, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.030685632092056898, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10889084841332757, + "kl": 0.000713348388671875, + "learning_rate": 3.0425531914893615e-07, + "loss": 0.0773, + "num_tokens": 106217258.0, + "reward": 0.6462053656578064, + "reward_std": 0.3555229902267456, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1841517835855484, + "rewards/tag_count_reward/std": 0.18277306854724884, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1514.6563720703125, + "completions/mean_terminated_length": 971.7026977539062, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.03089872675936285, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09982227377646177, + "kl": 0.0007123947143554688, + "learning_rate": 3.063829787234042e-07, + "loss": 0.088, + "num_tokens": 106963776.0, + "reward": 0.535714328289032, + "reward_std": 0.3437630534172058, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1495535671710968, + "rewards/tag_count_reward/std": 0.17842154204845428, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1284.5826416015625, + "completions/mean_terminated_length": 776.5836181640625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.0311118214266688, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.5199829146786306, + "kl": 0.0014495849609375, + "learning_rate": 3.085106382978723e-07, + "loss": 0.1156, + "num_tokens": 107604533.0, + "reward": 0.5636160969734192, + "reward_std": 0.3381498456001282, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.4852356016635895, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1863839328289032, + "rewards/tag_count_reward/std": 0.18733346462249756, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1435.5023193359375, + "completions/mean_terminated_length": 854.9608154296875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.031324916093974746, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15615433434201076, + "kl": 0.00084686279296875, + "learning_rate": 3.1063829787234044e-07, + "loss": 0.1369, + "num_tokens": 108314566.0, + "reward": 0.5217634439468384, + "reward_std": 0.3766101896762848, + "rewards/accuracy_reward/mean": 0.3705357015132904, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1512276828289032, + "rewards/tag_count_reward/std": 0.1917879283428192, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1588.509033203125, + "completions/mean_terminated_length": 1033.95068359375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.031538010761280696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09700597541344583, + "kl": 0.0007734298706054688, + "learning_rate": 3.127659574468085e-07, + "loss": 0.1141, + "num_tokens": 109097370.0, + "reward": 0.5128348469734192, + "reward_std": 0.34065452218055725, + "rewards/accuracy_reward/mean": 0.3705357015132904, + "rewards/accuracy_reward/std": 0.4834881126880646, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1422991007566452, + "rewards/tag_count_reward/std": 0.17372694611549377, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1526.5023193359375, + "completions/mean_terminated_length": 1049.5770263671875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.03175110542858665, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10050245436910464, + "kl": 0.0007734298706054688, + "learning_rate": 3.148936170212766e-07, + "loss": 0.109, + "num_tokens": 109847387.0, + "reward": 0.6305803656578064, + "reward_std": 0.3712863624095917, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1662946492433548, + "rewards/tag_count_reward/std": 0.1776748150587082, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1575.950927734375, + "completions/mean_terminated_length": 846.4204711914062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0319642000958926, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12137704010035566, + "kl": 0.0007715225219726562, + "learning_rate": 3.170212765957447e-07, + "loss": 0.072, + "num_tokens": 110629173.0, + "reward": 0.3431919813156128, + "reward_std": 0.31204429268836975, + "rewards/accuracy_reward/mean": 0.2075892835855484, + "rewards/accuracy_reward/std": 0.4060344398021698, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1356026828289032, + "rewards/tag_count_reward/std": 0.18889549374580383, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1409.247802734375, + "completions/mean_terminated_length": 916.9288940429688, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.03217729476319855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1022040412979308, + "kl": 0.0007123947143554688, + "learning_rate": 3.1914893617021275e-07, + "loss": 0.1035, + "num_tokens": 111330628.0, + "reward": 0.4921875298023224, + "reward_std": 0.32500773668289185, + "rewards/accuracy_reward/mean": 0.3102678656578064, + "rewards/accuracy_reward/std": 0.46312037110328674, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1819196492433548, + "rewards/tag_count_reward/std": 0.20370477437973022, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1384.4107666015625, + "completions/mean_terminated_length": 824.5925903320312, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.0323903894305045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10801845039896064, + "kl": 0.0007963180541992188, + "learning_rate": 3.2127659574468083e-07, + "loss": 0.1319, + "num_tokens": 112022108.0, + "reward": 0.520089328289032, + "reward_std": 0.3736415505409241, + "rewards/accuracy_reward/mean": 0.3482142984867096, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.171875, + "rewards/tag_count_reward/std": 0.19075748324394226, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1509.2545166015625, + "completions/mean_terminated_length": 1016.5556030273438, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.03260348409781045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10586276827109281, + "kl": 0.000843048095703125, + "learning_rate": 3.2340425531914897e-07, + "loss": 0.0882, + "num_tokens": 112765406.0, + "reward": 0.5619419813156128, + "reward_std": 0.3749977648258209, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1668526828289032, + "rewards/tag_count_reward/std": 0.18220780789852142, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1344.02685546875, + "completions/mean_terminated_length": 811.2157592773438, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.0328165787651164, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12354788082657898, + "kl": 0.0008745193481445312, + "learning_rate": 3.2553191489361704e-07, + "loss": 0.1751, + "num_tokens": 113434410.0, + "reward": 0.5680803656578064, + "reward_std": 0.36260661482810974, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1863839328289032, + "rewards/tag_count_reward/std": 0.18733346462249756, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1332.165283203125, + "completions/mean_terminated_length": 790.3765258789062, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.03302967343242235, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10072773853337534, + "kl": 0.0007638931274414062, + "learning_rate": 3.2765957446808507e-07, + "loss": 0.1433, + "num_tokens": 114097444.0, + "reward": 0.5267857313156128, + "reward_std": 0.3183961808681488, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1540178507566452, + "rewards/tag_count_reward/std": 0.18918029963970184, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1328.790283203125, + "completions/mean_terminated_length": 808.7461547851562, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.0332427680997283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11892638203854576, + "kl": 0.000904083251953125, + "learning_rate": 3.2978723404255315e-07, + "loss": 0.0896, + "num_tokens": 114762710.0, + "reward": 0.582589328289032, + "reward_std": 0.35534539818763733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1763392835855484, + "rewards/tag_count_reward/std": 0.19754758477210999, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1416.794677734375, + "completions/mean_terminated_length": 844.6808471679688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.03345586276703426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1094296697067676, + "kl": 0.000827789306640625, + "learning_rate": 3.3191489361702123e-07, + "loss": 0.1421, + "num_tokens": 115465082.0, + "reward": 0.486607164144516, + "reward_std": 0.4065491855144501, + "rewards/accuracy_reward/mean": 0.3214285671710968, + "rewards/accuracy_reward/std": 0.4675469994544983, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1651785671710968, + "rewards/tag_count_reward/std": 0.19300860166549683, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1278.243408203125, + "completions/mean_terminated_length": 871.0341186523438, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.03366895743434021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11140758435887073, + "kl": 0.000904083251953125, + "learning_rate": 3.340425531914893e-07, + "loss": 0.0945, + "num_tokens": 116102375.0, + "reward": 0.6266741156578064, + "reward_std": 0.39421525597572327, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2003348171710968, + "rewards/tag_count_reward/std": 0.18820029497146606, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1166.5826416015625, + "completions/mean_terminated_length": 778.3054809570312, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.03388205210164616, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1.8275997814768032, + "kl": 0.0020885467529296875, + "learning_rate": 3.3617021276595744e-07, + "loss": 0.1255, + "num_tokens": 116691260.0, + "reward": 0.7327009439468384, + "reward_std": 0.3838199973106384, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2081473171710968, + "rewards/tag_count_reward/std": 0.1966029405593872, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1379.5357666015625, + "completions/mean_terminated_length": 845.3011474609375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.03409514676895211, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10448353642895948, + "kl": 0.0009469985961914062, + "learning_rate": 3.382978723404255e-07, + "loss": 0.1081, + "num_tokens": 117378204.0, + "reward": 0.5714285969734192, + "reward_std": 0.37186500430107117, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.485796183347702, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1919642835855484, + "rewards/tag_count_reward/std": 0.2186294049024582, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1436.43310546875, + "completions/mean_terminated_length": 901.6317749023438, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.03430824143625806, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10995124087304915, + "kl": 0.0010004043579101562, + "learning_rate": 3.404255319148936e-07, + "loss": 0.0847, + "num_tokens": 118091838.0, + "reward": 0.59765625, + "reward_std": 0.4327698051929474, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.17578125, + "rewards/tag_count_reward/std": 0.2025824636220932, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1471.9754638671875, + "completions/mean_terminated_length": 807.331787109375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.03452133610356401, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1087052374101123, + "kl": 0.0008993148803710938, + "learning_rate": 3.425531914893617e-07, + "loss": 0.089, + "num_tokens": 118829891.0, + "reward": 0.4185267984867096, + "reward_std": 0.34510934352874756, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44215917587280273, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1529017835855484, + "rewards/tag_count_reward/std": 0.1893485188484192, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1558.904052734375, + "completions/mean_terminated_length": 1056.5294189453125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.03473443077086996, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.0951884165325215, + "kl": 0.0008544921875, + "learning_rate": 3.4468085106382976e-07, + "loss": 0.0621, + "num_tokens": 119604184.0, + "reward": 0.598214328289032, + "reward_std": 0.39945685863494873, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1696428507566452, + "rewards/tag_count_reward/std": 0.17999590933322906, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1464.060302734375, + "completions/mean_terminated_length": 966.987548828125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.03494752543817591, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10062776057654353, + "kl": 0.0009670257568359375, + "learning_rate": 3.4680851063829784e-07, + "loss": 0.077, + "num_tokens": 120323235.0, + "reward": 0.5703125, + "reward_std": 0.34405753016471863, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1551339328289032, + "rewards/tag_count_reward/std": 0.1686781793832779, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1355.7701416015625, + "completions/mean_terminated_length": 761.19921875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.03516062010548186, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 1.5572030012754672, + "kl": 0.004134178161621094, + "learning_rate": 3.4893617021276597e-07, + "loss": 0.1065, + "num_tokens": 121002828.0, + "reward": 0.5150669813156128, + "reward_std": 0.3479662537574768, + "rewards/accuracy_reward/mean": 0.3482142984867096, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1668526828289032, + "rewards/tag_count_reward/std": 0.1852518767118454, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1444.01123046875, + "completions/mean_terminated_length": 896.56591796875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.03537371477278781, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10302955056637572, + "kl": 0.0010671615600585938, + "learning_rate": 3.5106382978723405e-07, + "loss": 0.0967, + "num_tokens": 121710881.0, + "reward": 0.5379464626312256, + "reward_std": 0.37933671474456787, + "rewards/accuracy_reward/mean": 0.3705357015132904, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1674107164144516, + "rewards/tag_count_reward/std": 0.19397637248039246, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1614.5179443359375, + "completions/mean_terminated_length": 986.7977905273438, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.035586809440093764, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10032984664205931, + "kl": 0.0009374618530273438, + "learning_rate": 3.5319148936170213e-07, + "loss": 0.0689, + "num_tokens": 122511385.0, + "reward": 0.416294664144516, + "reward_std": 0.3271806836128235, + "rewards/accuracy_reward/mean": 0.2767857015132904, + "rewards/accuracy_reward/std": 0.44790977239608765, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1395089328289032, + "rewards/tag_count_reward/std": 0.17155487835407257, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1462.7054443359375, + "completions/mean_terminated_length": 959.9834594726562, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.035799904107399715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09878559508081752, + "kl": 0.0010223388671875, + "learning_rate": 3.553191489361702e-07, + "loss": 0.0911, + "num_tokens": 123237749.0, + "reward": 0.5909598469734192, + "reward_std": 0.4037070870399475, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1802455335855484, + "rewards/tag_count_reward/std": 0.1921715885400772, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1550.55810546875, + "completions/mean_terminated_length": 1016.2685546875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.036012998774705665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12871041056750562, + "kl": 0.0010776519775390625, + "learning_rate": 3.574468085106383e-07, + "loss": 0.0877, + "num_tokens": 123999343.0, + "reward": 0.555245578289032, + "reward_std": 0.430169939994812, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.17578125, + "rewards/tag_count_reward/std": 0.18901443481445312, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1412.0201416015625, + "completions/mean_terminated_length": 921.8379516601562, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.036226093442011616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10024261913313043, + "kl": 0.0009641647338867188, + "learning_rate": 3.5957446808510637e-07, + "loss": 0.0877, + "num_tokens": 124710376.0, + "reward": 0.6757813096046448, + "reward_std": 0.3762159049510956, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2003348171710968, + "rewards/tag_count_reward/std": 0.21652869880199432, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1484.6585693359375, + "completions/mean_terminated_length": 950.7086791992188, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.036439188109317566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10476556008040915, + "kl": 0.0009899139404296875, + "learning_rate": 3.617021276595745e-07, + "loss": 0.1203, + "num_tokens": 125451983.0, + "reward": 0.482700914144516, + "reward_std": 0.37238675355911255, + "rewards/accuracy_reward/mean": 0.3013392984867096, + "rewards/accuracy_reward/std": 0.4593527019023895, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1813616007566452, + "rewards/tag_count_reward/std": 0.21585306525230408, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1568.58935546875, + "completions/mean_terminated_length": 911.6190185546875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.03665228277662352, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.09391397013883276, + "kl": 0.0009832382202148438, + "learning_rate": 3.638297872340426e-07, + "loss": 0.1185, + "num_tokens": 126225223.0, + "reward": 0.4252232313156128, + "reward_std": 0.31059473752975464, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44215917587280273, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1595982164144516, + "rewards/tag_count_reward/std": 0.19694946706295013, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1458.7054443359375, + "completions/mean_terminated_length": 919.77783203125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.03686537744392947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2044661745008977, + "kl": 0.0010404586791992188, + "learning_rate": 3.659574468085106e-07, + "loss": 0.1051, + "num_tokens": 126950115.0, + "reward": 0.4542410969734192, + "reward_std": 0.35335448384284973, + "rewards/accuracy_reward/mean": 0.2767857015132904, + "rewards/accuracy_reward/std": 0.44790980219841003, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1774553507566452, + "rewards/tag_count_reward/std": 0.19866611063480377, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1458.950927734375, + "completions/mean_terminated_length": 925.0467529296875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.03707847211123542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10181673395391216, + "kl": 0.0010509490966796875, + "learning_rate": 3.680851063829787e-07, + "loss": 0.098, + "num_tokens": 127672733.0, + "reward": 0.4877232313156128, + "reward_std": 0.3558724820613861, + "rewards/accuracy_reward/mean": 0.3080357015132904, + "rewards/accuracy_reward/std": 0.462197482585907, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1796875, + "rewards/tag_count_reward/std": 0.21885482966899872, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1423.107177734375, + "completions/mean_terminated_length": 967.104248046875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.03729156677854137, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10329705877408646, + "kl": 0.0010509490966796875, + "learning_rate": 3.7021276595744676e-07, + "loss": 0.153, + "num_tokens": 128378301.0, + "reward": 0.6316964626312256, + "reward_std": 0.3447173535823822, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1763392835855484, + "rewards/tag_count_reward/std": 0.18662908673286438, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1437.8751220703125, + "completions/mean_terminated_length": 869.8275756835938, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.03750466144584732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14223293621782093, + "kl": 0.0011081695556640625, + "learning_rate": 3.7234042553191484e-07, + "loss": 0.0979, + "num_tokens": 129098501.0, + "reward": 0.551339328289032, + "reward_std": 0.4390011727809906, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1763392835855484, + "rewards/tag_count_reward/std": 0.20035871863365173, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1521.4732666015625, + "completions/mean_terminated_length": 908.4637451171875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.03771775611315327, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10621048236220616, + "kl": 0.0010623931884765625, + "learning_rate": 3.744680851063829e-07, + "loss": 0.0978, + "num_tokens": 129853849.0, + "reward": 0.4726562798023224, + "reward_std": 0.3519934415817261, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.46403056383132935, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.16015625, + "rewards/tag_count_reward/std": 0.19254150986671448, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1387.5023193359375, + "completions/mean_terminated_length": 809.912109375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.03793085078045922, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1090509653471645, + "kl": 0.00127410888671875, + "learning_rate": 3.7659574468085106e-07, + "loss": 0.0839, + "num_tokens": 130542586.0, + "reward": 0.4977678656578064, + "reward_std": 0.33674728870391846, + "rewards/accuracy_reward/mean": 0.3191964328289032, + "rewards/accuracy_reward/std": 0.4666863977909088, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1785714328289032, + "rewards/tag_count_reward/std": 0.19766129553318024, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1453.8170166015625, + "completions/mean_terminated_length": 961.4938354492188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.03814394544776517, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10596649246298591, + "kl": 0.0011997222900390625, + "learning_rate": 3.7872340425531914e-07, + "loss": 0.1126, + "num_tokens": 131267480.0, + "reward": 0.6222098469734192, + "reward_std": 0.363572895526886, + "rewards/accuracy_reward/mean": 0.46759259700775146, + "rewards/accuracy_reward/std": 0.49952712655067444, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1713169664144516, + "rewards/tag_count_reward/std": 0.1732303947210312, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1402.009033203125, + "completions/mean_terminated_length": 821.7118530273438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.03835704011507112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11570147599392522, + "kl": 0.0011806488037109375, + "learning_rate": 3.808510638297872e-07, + "loss": 0.1471, + "num_tokens": 131960428.0, + "reward": 0.4899553656578064, + "reward_std": 0.35651370882987976, + "rewards/accuracy_reward/mean": 0.3333333432674408, + "rewards/accuracy_reward/std": 0.47195106744766235, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1685267835855484, + "rewards/tag_count_reward/std": 0.18561963737010956, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1420.6719970703125, + "completions/mean_terminated_length": 886.6652221679688, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.03857013478237707, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10997003403263109, + "kl": 0.001247406005859375, + "learning_rate": 3.829787234042553e-07, + "loss": 0.1163, + "num_tokens": 132671193.0, + "reward": 0.5524553656578064, + "reward_std": 0.37749508023262024, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1685267835855484, + "rewards/tag_count_reward/std": 0.17949236929416656, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1397.524658203125, + "completions/mean_terminated_length": 900.7047119140625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.03878322944968302, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10221610579664953, + "kl": 0.001186370849609375, + "learning_rate": 3.8510638297872337e-07, + "loss": 0.1086, + "num_tokens": 133370180.0, + "reward": 0.5831473469734192, + "reward_std": 0.36309346556663513, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1880580335855484, + "rewards/tag_count_reward/std": 0.20119112730026245, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1366.118408203125, + "completions/mean_terminated_length": 882.0343627929688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.03899632411698897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10232054026371493, + "kl": 0.0011768341064453125, + "learning_rate": 3.8723404255319145e-07, + "loss": 0.0929, + "num_tokens": 134047513.0, + "reward": 0.6077009439468384, + "reward_std": 0.3802623450756073, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1925223171710968, + "rewards/tag_count_reward/std": 0.19691382348537445, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1442.3660888671875, + "completions/mean_terminated_length": 917.4833984375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.039209418784294923, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12095782636359531, + "kl": 0.0012035369873046875, + "learning_rate": 3.893617021276596e-07, + "loss": 0.1314, + "num_tokens": 134762861.0, + "reward": 0.555245578289032, + "reward_std": 0.32575860619544983, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1735491007566452, + "rewards/tag_count_reward/std": 0.19541189074516296, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1410.774658203125, + "completions/mean_terminated_length": 868.3429565429688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.039422513451600874, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09501894589540566, + "kl": 0.0010967254638671875, + "learning_rate": 3.9148936170212766e-07, + "loss": 0.1342, + "num_tokens": 135467432.0, + "reward": 0.559151828289032, + "reward_std": 0.3499615788459778, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.485796183347702, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1796875, + "rewards/tag_count_reward/std": 0.18940123915672302, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1304.6451416015625, + "completions/mean_terminated_length": 823.6507568359375, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.039635608118906825, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11156289833346619, + "kl": 0.0013427734375, + "learning_rate": 3.9361702127659574e-07, + "loss": 0.1297, + "num_tokens": 136117049.0, + "reward": 0.640625, + "reward_std": 0.3706938624382019, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2098214328289032, + "rewards/tag_count_reward/std": 0.21494384109973907, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1588.80810546875, + "completions/mean_terminated_length": 917.6813354492188, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.039848702786212775, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09302606053602087, + "kl": 0.0011444091796875, + "learning_rate": 3.957446808510638e-07, + "loss": 0.1349, + "num_tokens": 136892259.0, + "reward": 0.3989955484867096, + "reward_std": 0.33329832553863525, + "rewards/accuracy_reward/mean": 0.2522321343421936, + "rewards/accuracy_reward/std": 0.4347792863845825, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1467633992433548, + "rewards/tag_count_reward/std": 0.18341653048992157, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1513.9442138671875, + "completions/mean_terminated_length": 1029.8851318359375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.040061797453518726, + "frac_reward_zero_std": 0.0, + "grad_norm": 26.97107691584283, + "kl": 0.43701171875, + "learning_rate": 3.978723404255319e-07, + "loss": 0.1323, + "num_tokens": 137643946.0, + "reward": 0.5691964626312256, + "reward_std": 0.4128606915473938, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.485796183347702, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1897321492433548, + "rewards/tag_count_reward/std": 0.19995954632759094, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1372.8126220703125, + "completions/mean_terminated_length": 893.4808959960938, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.040274892120824676, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10437976458759893, + "kl": 0.001232147216796875, + "learning_rate": 4e-07, + "loss": 0.1313, + "num_tokens": 138321830.0, + "reward": 0.6858259439468384, + "reward_std": 0.4512696862220764, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1880580335855484, + "rewards/tag_count_reward/std": 0.20049497485160828, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1419.5982666015625, + "completions/mean_terminated_length": 870.0753173828125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.04048798678813063, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10374834102028774, + "kl": 0.001277923583984375, + "learning_rate": 4.021276595744681e-07, + "loss": 0.1116, + "num_tokens": 139033826.0, + "reward": 0.5301339626312256, + "reward_std": 0.41140657663345337, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1573660671710968, + "rewards/tag_count_reward/std": 0.18714678287506104, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1384.44873046875, + "completions/mean_terminated_length": 814.5104370117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.04070108145543658, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10744661867463708, + "kl": 0.0012378692626953125, + "learning_rate": 4.0425531914893614e-07, + "loss": 0.1451, + "num_tokens": 139725307.0, + "reward": 0.5078125, + "reward_std": 0.3229517936706543, + "rewards/accuracy_reward/mean": 0.3303571343421936, + "rewards/accuracy_reward/std": 0.4708675146102905, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1774553507566452, + "rewards/tag_count_reward/std": 0.19583068788051605, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1510.5804443359375, + "completions/mean_terminated_length": 825.8477172851562, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.04091417612274253, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10491696435039138, + "kl": 0.001186370849609375, + "learning_rate": 4.063829787234042e-07, + "loss": 0.1181, + "num_tokens": 140478639.0, + "reward": 0.4977678656578064, + "reward_std": 0.40886130928993225, + "rewards/accuracy_reward/mean": 0.3415178656578064, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.15625, + "rewards/tag_count_reward/std": 0.20309406518936157, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1342.15625, + "completions/mean_terminated_length": 854.7245483398438, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.04112727079004848, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2116215064290924, + "kl": 0.0012226104736328125, + "learning_rate": 4.085106382978723e-07, + "loss": 0.1273, + "num_tokens": 141154085.0, + "reward": 0.668526828289032, + "reward_std": 0.43462803959846497, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2042410671710968, + "rewards/tag_count_reward/std": 0.2138228863477707, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1264.0179443359375, + "completions/mean_terminated_length": 832.6920776367188, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.04134036545735443, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12045199991956956, + "kl": 0.0015106201171875, + "learning_rate": 4.106382978723404e-07, + "loss": 0.1592, + "num_tokens": 141789277.0, + "reward": 0.6640625, + "reward_std": 0.381980836391449, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2176339328289032, + "rewards/tag_count_reward/std": 0.21431276202201843, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1439.227783203125, + "completions/mean_terminated_length": 877.4849853515625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.04155346012466038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10252329126161434, + "kl": 0.0013751983642578125, + "learning_rate": 4.1276595744680846e-07, + "loss": 0.1338, + "num_tokens": 142507475.0, + "reward": 0.5418527126312256, + "reward_std": 0.44359689950942993, + "rewards/accuracy_reward/mean": 0.3638392984867096, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1780133992433548, + "rewards/tag_count_reward/std": 0.21147681772708893, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1302.9576416015625, + "completions/mean_terminated_length": 793.1917724609375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.04176655479196633, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10496685851326004, + "kl": 0.0014324188232421875, + "learning_rate": 4.148936170212766e-07, + "loss": 0.1119, + "num_tokens": 143163264.0, + "reward": 0.6155134439468384, + "reward_std": 0.42997801303863525, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1936383992433548, + "rewards/tag_count_reward/std": 0.2096090316772461, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1455.091552734375, + "completions/mean_terminated_length": 823.9308471679688, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.04197964945927228, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09922929912706371, + "kl": 0.00128936767578125, + "learning_rate": 4.1702127659574467e-07, + "loss": 0.1415, + "num_tokens": 143890249.0, + "reward": 0.5301339626312256, + "reward_std": 0.3826269507408142, + "rewards/accuracy_reward/mean": 0.3616071343421936, + "rewards/accuracy_reward/std": 0.48100292682647705, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1685267835855484, + "rewards/tag_count_reward/std": 0.19801151752471924, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1320.82373046875, + "completions/mean_terminated_length": 854.6849975585938, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.04219274412657823, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10871571318377708, + "kl": 0.0014801025390625, + "learning_rate": 4.1914893617021275e-07, + "loss": 0.1487, + "num_tokens": 144551450.0, + "reward": 0.6696428656578064, + "reward_std": 0.41537654399871826, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1830357164144516, + "rewards/tag_count_reward/std": 0.18765638768672943, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1397.977783203125, + "completions/mean_terminated_length": 932.2528686523438, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.04240583879388418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10278064122495711, + "kl": 0.0013484954833984375, + "learning_rate": 4.2127659574468083e-07, + "loss": 0.1236, + "num_tokens": 145250672.0, + "reward": 0.66015625, + "reward_std": 0.35941562056541443, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1936383992433548, + "rewards/tag_count_reward/std": 0.19723689556121826, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1329.415283203125, + "completions/mean_terminated_length": 749.9112548828125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.04261893346119013, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11291643137015937, + "kl": 0.001422882080078125, + "learning_rate": 4.234042553191489e-07, + "loss": 0.1212, + "num_tokens": 145917818.0, + "reward": 0.4693080484867096, + "reward_std": 0.35822468996047974, + "rewards/accuracy_reward/mean": 0.3194444477558136, + "rewards/accuracy_reward/std": 0.4668020009994507, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1612723171710968, + "rewards/tag_count_reward/std": 0.20086821913719177, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1432.060302734375, + "completions/mean_terminated_length": 990.7547607421875, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.04283202812849608, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10012160763790595, + "kl": 0.00139617919921875, + "learning_rate": 4.25531914893617e-07, + "loss": 0.1277, + "num_tokens": 146626437.0, + "reward": 0.60546875, + "reward_std": 0.3882697522640228, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.19921875, + "rewards/tag_count_reward/std": 0.20292727649211884, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1527.0469970703125, + "completions/mean_terminated_length": 957.406494140625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.043045122795802034, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10127694106650852, + "kl": 0.0013141632080078125, + "learning_rate": 4.276595744680851e-07, + "loss": 0.1074, + "num_tokens": 147387546.0, + "reward": 0.4988839626312256, + "reward_std": 0.3934634327888489, + "rewards/accuracy_reward/mean": 0.3325892984867096, + "rewards/accuracy_reward/std": 0.47166749835014343, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1662946492433548, + "rewards/tag_count_reward/std": 0.20744557678699493, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1350.497802734375, + "completions/mean_terminated_length": 873.2593994140625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.043258217463107984, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10468293347398858, + "kl": 0.001506805419921875, + "learning_rate": 4.297872340425532e-07, + "loss": 0.0818, + "num_tokens": 148060089.0, + "reward": 0.58203125, + "reward_std": 0.38847699761390686, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1891741007566452, + "rewards/tag_count_reward/std": 0.20291495323181152, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1348.18310546875, + "completions/mean_terminated_length": 842.1615600585938, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.043471312130413935, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11401171455710374, + "kl": 0.0015716552734375, + "learning_rate": 4.319148936170213e-07, + "loss": 0.1542, + "num_tokens": 148731899.0, + "reward": 0.6674107313156128, + "reward_std": 0.41716843843460083, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2053571492433548, + "rewards/tag_count_reward/std": 0.2204943597316742, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1512.7701416015625, + "completions/mean_terminated_length": 953.1004028320312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.043684406797719885, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.156304426547695, + "kl": 0.200439453125, + "learning_rate": 4.3404255319148936e-07, + "loss": 0.1376, + "num_tokens": 149483876.0, + "reward": 0.4871652126312256, + "reward_std": 0.34277215600013733, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.46403056383132935, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1746651828289032, + "rewards/tag_count_reward/std": 0.2069537490606308, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1311.216552734375, + "completions/mean_terminated_length": 753.5725708007812, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.043897501465025836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11912765987063775, + "kl": 0.00160980224609375, + "learning_rate": 4.3617021276595744e-07, + "loss": 0.1512, + "num_tokens": 150145973.0, + "reward": 0.6702009439468384, + "reward_std": 0.38861900568008423, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1902901828289032, + "rewards/tag_count_reward/std": 0.19409781694412231, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1315.0692138671875, + "completions/mean_terminated_length": 871.1075439453125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.044110596132331786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22304967037251217, + "kl": 0.001598358154296875, + "learning_rate": 4.382978723404255e-07, + "loss": 0.1081, + "num_tokens": 150805540.0, + "reward": 0.61328125, + "reward_std": 0.41968443989753723, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1958705335855484, + "rewards/tag_count_reward/std": 0.1999712586402893, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1439.07373046875, + "completions/mean_terminated_length": 925.370361328125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.04432369079963774, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11242731061780704, + "kl": 0.0016345977783203125, + "learning_rate": 4.404255319148936e-07, + "loss": 0.1042, + "num_tokens": 151517813.0, + "reward": 0.5887277126312256, + "reward_std": 0.4099920988082886, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.20703125, + "rewards/tag_count_reward/std": 0.22860509157180786, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1487.9442138671875, + "completions/mean_terminated_length": 927.888427734375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.04453678546694369, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09382352806250878, + "kl": 0.0014190673828125, + "learning_rate": 4.425531914893617e-07, + "loss": 0.0891, + "num_tokens": 152265724.0, + "reward": 0.5758928656578064, + "reward_std": 0.3854631185531616, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1919642835855484, + "rewards/tag_count_reward/std": 0.2211727499961853, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1356.9107666015625, + "completions/mean_terminated_length": 913.90478515625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.04474988013424964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11980874924488194, + "kl": 0.0016326904296875, + "learning_rate": 4.4468085106382975e-07, + "loss": 0.1465, + "num_tokens": 152940100.0, + "reward": 0.7003348469734192, + "reward_std": 0.37838613986968994, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2159598171710968, + "rewards/tag_count_reward/std": 0.2104293406009674, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1390.9844970703125, + "completions/mean_terminated_length": 902.6964721679688, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.04496297480155559, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11965225096943578, + "kl": 0.00153350830078125, + "learning_rate": 4.4680851063829783e-07, + "loss": 0.0594, + "num_tokens": 153625533.0, + "reward": 0.6640625, + "reward_std": 0.41250860691070557, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2109375, + "rewards/tag_count_reward/std": 0.20034313201904297, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1366.4376220703125, + "completions/mean_terminated_length": 961.3807983398438, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.04517606946886154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11557599958263355, + "kl": 0.0016002655029296875, + "learning_rate": 4.489361702127659e-07, + "loss": 0.1284, + "num_tokens": 154304433.0, + "reward": 0.6428571939468384, + "reward_std": 0.4596438705921173, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2075892835855484, + "rewards/tag_count_reward/std": 0.1989581137895584, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1297.5067138671875, + "completions/mean_terminated_length": 816.4212646484375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.04538916413616749, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11110708060337988, + "kl": 0.00167083740234375, + "learning_rate": 4.51063829787234e-07, + "loss": 0.1593, + "num_tokens": 154948516.0, + "reward": 0.6434152126312256, + "reward_std": 0.4187160134315491, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.21484375, + "rewards/tag_count_reward/std": 0.22871975600719452, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1404.7344970703125, + "completions/mean_terminated_length": 939.6038208007812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.04560225880347344, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10196443242568666, + "kl": 0.00173187255859375, + "learning_rate": 4.5319148936170207e-07, + "loss": 0.0848, + "num_tokens": 155643757.0, + "reward": 0.6238839626312256, + "reward_std": 0.3447335958480835, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1863839328289032, + "rewards/tag_count_reward/std": 0.18955937027931213, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1305.90625, + "completions/mean_terminated_length": 860.6500244140625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.04581535347077939, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11670707779154094, + "kl": 0.001613616943359375, + "learning_rate": 4.553191489361702e-07, + "loss": 0.0844, + "num_tokens": 156303331.0, + "reward": 0.609933078289032, + "reward_std": 0.40606167912483215, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.21484375, + "rewards/tag_count_reward/std": 0.2122310847043991, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1245.2879638671875, + "completions/mean_terminated_length": 828.966064453125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.04602844813808534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1117023704373016, + "kl": 0.0017108917236328125, + "learning_rate": 4.574468085106383e-07, + "loss": 0.1098, + "num_tokens": 156931108.0, + "reward": 0.6869419813156128, + "reward_std": 0.42914876341819763, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401999473572, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2137276828289032, + "rewards/tag_count_reward/std": 0.19629153609275818, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1352.977783203125, + "completions/mean_terminated_length": 873.0188598632812, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.04624154280539129, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10908169488479595, + "kl": 0.0015811920166015625, + "learning_rate": 4.5957446808510636e-07, + "loss": 0.0976, + "num_tokens": 157599354.0, + "reward": 0.6233259439468384, + "reward_std": 0.41876834630966187, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1969866007566452, + "rewards/tag_count_reward/std": 0.20373158156871796, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1310.982177734375, + "completions/mean_terminated_length": 737.74609375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.04645463747269724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13422275796443275, + "kl": 0.0016231536865234375, + "learning_rate": 4.6170212765957444e-07, + "loss": 0.1512, + "num_tokens": 158250514.0, + "reward": 0.5217634439468384, + "reward_std": 0.3856152892112732, + "rewards/accuracy_reward/mean": 0.3348214328289032, + "rewards/accuracy_reward/std": 0.47245556116104126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1869419664144516, + "rewards/tag_count_reward/std": 0.1959095597267151, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1323.37060546875, + "completions/mean_terminated_length": 912.9160766601562, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.04666773214000319, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1491887727377492, + "kl": 0.0015430450439453125, + "learning_rate": 4.638297872340425e-07, + "loss": 0.1082, + "num_tokens": 158908072.0, + "reward": 0.7265625596046448, + "reward_std": 0.4064008891582489, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2220982164144516, + "rewards/tag_count_reward/std": 0.20698769390583038, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1476.107177734375, + "completions/mean_terminated_length": 1002.2529907226562, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.04688082680730915, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40464057792231767, + "kl": 0.007049560546875, + "learning_rate": 4.659574468085106e-07, + "loss": 0.1002, + "num_tokens": 159645704.0, + "reward": 0.6155134439468384, + "reward_std": 0.3887866735458374, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.19140625, + "rewards/tag_count_reward/std": 0.19658386707305908, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1428.685302734375, + "completions/mean_terminated_length": 1008.8502197265625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.0470939214746151, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10143971225602694, + "kl": 0.0016422271728515625, + "learning_rate": 4.6808510638297873e-07, + "loss": 0.0794, + "num_tokens": 160358987.0, + "reward": 0.6852678656578064, + "reward_std": 0.4163140654563904, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2165178507566452, + "rewards/tag_count_reward/std": 0.22183778882026672, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1406.0357666015625, + "completions/mean_terminated_length": 813.6652221679688, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.04730701614192105, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.0996447702921371, + "kl": 0.00140380859375, + "learning_rate": 4.702127659574468e-07, + "loss": 0.111, + "num_tokens": 161054443.0, + "reward": 0.4910714626312256, + "reward_std": 0.3968755006790161, + "rewards/accuracy_reward/mean": 0.3080357015132904, + "rewards/accuracy_reward/std": 0.46219751238822937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1830357164144516, + "rewards/tag_count_reward/std": 0.19639405608177185, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1249.74560546875, + "completions/mean_terminated_length": 775.3380737304688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.047520110809227, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11154026812212577, + "kl": 0.0016765594482421875, + "learning_rate": 4.723404255319149e-07, + "loss": 0.1118, + "num_tokens": 161679465.0, + "reward": 0.6328125, + "reward_std": 0.3738306760787964, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1819196492433548, + "rewards/tag_count_reward/std": 0.17885135114192963, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1425.3148193359375, + "completions/mean_terminated_length": 987.30419921875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.04773320547653295, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.22204444435954832, + "kl": 0.003631591796875, + "learning_rate": 4.7446808510638297e-07, + "loss": 0.1246, + "num_tokens": 162393254.0, + "reward": 0.6718750596046448, + "reward_std": 0.387226939201355, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.203125, + "rewards/tag_count_reward/std": 0.20283572375774384, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1356.352783203125, + "completions/mean_terminated_length": 856.2384643554688, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.0479463001438389, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16879309262851652, + "kl": 0.0016574859619140625, + "learning_rate": 4.7659574468085105e-07, + "loss": 0.1004, + "num_tokens": 163068660.0, + "reward": 0.5870535969734192, + "reward_std": 0.39906179904937744, + "rewards/accuracy_reward/mean": 0.38657405972480774, + "rewards/accuracy_reward/std": 0.4875292479991913, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2142857164144516, + "rewards/tag_count_reward/std": 0.22832709550857544, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1304.0179443359375, + "completions/mean_terminated_length": 866.0709228515625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.048159394811144854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11507590868466241, + "kl": 0.0015659332275390625, + "learning_rate": 4.787234042553192e-07, + "loss": 0.098, + "num_tokens": 163723852.0, + "reward": 0.54296875, + "reward_std": 0.370231568813324, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2036830335855484, + "rewards/tag_count_reward/std": 0.20806674659252167, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1363.09375, + "completions/mean_terminated_length": 876.8626098632812, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.048372489478450804, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10271745026307122, + "kl": 0.00164794921875, + "learning_rate": 4.808510638297872e-07, + "loss": 0.1139, + "num_tokens": 164405494.0, + "reward": 0.5870535969734192, + "reward_std": 0.34285223484039307, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2008928507566452, + "rewards/tag_count_reward/std": 0.20505164563655853, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1392.8438720703125, + "completions/mean_terminated_length": 896.98046875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.048585584145756755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11303936666548152, + "kl": 0.0016937255859375, + "learning_rate": 4.829787234042552e-07, + "loss": 0.1239, + "num_tokens": 165100912.0, + "reward": 0.5915178656578064, + "reward_std": 0.330929160118103, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2165178507566452, + "rewards/tag_count_reward/std": 0.23109875619411469, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1426.247802734375, + "completions/mean_terminated_length": 929.3453369140625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.048798678813062706, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8870956516781585, + "kl": 0.0076580047607421875, + "learning_rate": 4.851063829787234e-07, + "loss": 0.1046, + "num_tokens": 165819263.0, + "reward": 0.590401828289032, + "reward_std": 0.4257414937019348, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2020089328289032, + "rewards/tag_count_reward/std": 0.2113564908504486, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1404.247802734375, + "completions/mean_terminated_length": 1010.5863647460938, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.049011773480368656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10398513362142894, + "kl": 0.0017681121826171875, + "learning_rate": 4.872340425531915e-07, + "loss": 0.1286, + "num_tokens": 166520574.0, + "reward": 0.6579241156578064, + "reward_std": 0.413931667804718, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2248883992433548, + "rewards/tag_count_reward/std": 0.22010265290737152, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1244.966552734375, + "completions/mean_terminated_length": 828.4779663085938, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.04922486814767461, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11912494459626313, + "kl": 0.0019626617431640625, + "learning_rate": 4.893617021276595e-07, + "loss": 0.18, + "num_tokens": 167142687.0, + "reward": 0.707589328289032, + "reward_std": 0.45271551609039307, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2276785671710968, + "rewards/tag_count_reward/std": 0.21034106612205505, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1387.950927734375, + "completions/mean_terminated_length": 865.1920166015625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.04943796281498056, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.09625235051488164, + "kl": 0.001678466796875, + "learning_rate": 4.914893617021277e-07, + "loss": 0.1264, + "num_tokens": 167830249.0, + "reward": 0.5345982313156128, + "reward_std": 0.42067062854766846, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1908482164144516, + "rewards/tag_count_reward/std": 0.2016845941543579, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1409.7054443359375, + "completions/mean_terminated_length": 984.966552734375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.04965105748228651, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.22790115880219, + "kl": 0.05279541015625, + "learning_rate": 4.936170212765957e-07, + "loss": 0.1316, + "num_tokens": 168538341.0, + "reward": 0.590401828289032, + "reward_std": 0.3608662784099579, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2064732164144516, + "rewards/tag_count_reward/std": 0.18712009489536285, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1324.9910888671875, + "completions/mean_terminated_length": 934.9141235351562, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.04986415214959246, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10245777746963397, + "kl": 0.0017719268798828125, + "learning_rate": 4.957446808510638e-07, + "loss": 0.1033, + "num_tokens": 169198449.0, + "reward": 0.609375, + "reward_std": 0.3380663990974426, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2142857164144516, + "rewards/tag_count_reward/std": 0.207135409116745, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1462.149658203125, + "completions/mean_terminated_length": 881.5067138671875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.05007724681689841, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10375067384388459, + "kl": 0.001827239990234375, + "learning_rate": 4.978723404255318e-07, + "loss": 0.0947, + "num_tokens": 169930804.0, + "reward": 0.5915178656578064, + "reward_std": 0.3607367277145386, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2098214328289032, + "rewards/tag_count_reward/std": 0.22261303663253784, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1387.46435546875, + "completions/mean_terminated_length": 914.2068481445312, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.05029034148420436, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10549517129023503, + "kl": 0.001728057861328125, + "learning_rate": 5e-07, + "loss": 0.1101, + "num_tokens": 170619508.0, + "reward": 0.645089328289032, + "reward_std": 0.41320767998695374, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2098214328289032, + "rewards/tag_count_reward/std": 0.22008632123470306, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1505.732177734375, + "completions/mean_terminated_length": 1056.4244384765625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.05050343615151031, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10106960498822302, + "kl": 0.0016994476318359375, + "learning_rate": 5.02127659574468e-07, + "loss": 0.0977, + "num_tokens": 171365132.0, + "reward": 0.51171875, + "reward_std": 0.4081592857837677, + "rewards/accuracy_reward/mean": 0.3013392984867096, + "rewards/accuracy_reward/std": 0.4593527019023895, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2103794664144516, + "rewards/tag_count_reward/std": 0.2322404831647873, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1284.4866943359375, + "completions/mean_terminated_length": 847.810546875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.05071653081881626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12018573210774196, + "kl": 0.002071380615234375, + "learning_rate": 5.042553191489361e-07, + "loss": 0.0961, + "num_tokens": 172004150.0, + "reward": 0.6261160969734192, + "reward_std": 0.368063747882843, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2154017835855484, + "rewards/tag_count_reward/std": 0.20460976660251617, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1364.388427734375, + "completions/mean_terminated_length": 922.0514526367188, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.05092962548612221, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20252276414839693, + "kl": 0.0020904541015625, + "learning_rate": 5.063829787234042e-07, + "loss": 0.1213, + "num_tokens": 172681172.0, + "reward": 0.617745578289032, + "reward_std": 0.3969407379627228, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2315848171710968, + "rewards/tag_count_reward/std": 0.23128168284893036, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1396.1942138671875, + "completions/mean_terminated_length": 884.6175537109375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.05114272015342816, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10280922103701856, + "kl": 0.0018596649169921875, + "learning_rate": 5.085106382978723e-07, + "loss": 0.1412, + "num_tokens": 173370619.0, + "reward": 0.5189732313156128, + "reward_std": 0.35032397508621216, + "rewards/accuracy_reward/mean": 0.3102678656578064, + "rewards/accuracy_reward/std": 0.46312037110328674, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2087053507566452, + "rewards/tag_count_reward/std": 0.22921931743621826, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1334.212158203125, + "completions/mean_terminated_length": 813.3397827148438, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.05135581482073411, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10163979671398084, + "kl": 0.0020809173583984375, + "learning_rate": 5.106382978723403e-07, + "loss": 0.1552, + "num_tokens": 174038906.0, + "reward": 0.6501116156578064, + "reward_std": 0.4021851718425751, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.23046875, + "rewards/tag_count_reward/std": 0.23950733244419098, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1392.9107666015625, + "completions/mean_terminated_length": 840.2633666992188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.05156890948804006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09688813276520299, + "kl": 0.0017910003662109375, + "learning_rate": 5.127659574468085e-07, + "loss": 0.1088, + "num_tokens": 174730418.0, + "reward": 0.5340402126312256, + "reward_std": 0.3654673397541046, + "rewards/accuracy_reward/mean": 0.3214285671710968, + "rewards/accuracy_reward/std": 0.4675469994544983, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2126116007566452, + "rewards/tag_count_reward/std": 0.22341284155845642, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1415.7991943359375, + "completions/mean_terminated_length": 945.9533081054688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.05178200415534601, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1.083676560633267, + "kl": 0.0262298583984375, + "learning_rate": 5.148936170212766e-07, + "loss": 0.0713, + "num_tokens": 175436376.0, + "reward": 0.5496652126312256, + "reward_std": 0.3902077376842499, + "rewards/accuracy_reward/mean": 0.35185185074806213, + "rewards/accuracy_reward/std": 0.4781017303466797, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2103794664144516, + "rewards/tag_count_reward/std": 0.21472229063510895, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1399.1407470703125, + "completions/mean_terminated_length": 916.9143676757812, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.051995098822651964, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11236993357710642, + "kl": 0.001888275146484375, + "learning_rate": 5.170212765957447e-07, + "loss": 0.082, + "num_tokens": 176131559.0, + "reward": 0.6696428656578064, + "reward_std": 0.4155312478542328, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2299107164144516, + "rewards/tag_count_reward/std": 0.23504102230072021, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1472.3013916015625, + "completions/mean_terminated_length": 916.8026123046875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.052208193489957914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0985172887983217, + "kl": 0.0018062591552734375, + "learning_rate": 5.191489361702127e-07, + "loss": 0.1042, + "num_tokens": 176868382.0, + "reward": 0.5172991156578064, + "reward_std": 0.39164578914642334, + "rewards/accuracy_reward/mean": 0.3080357015132904, + "rewards/accuracy_reward/std": 0.46219751238822937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2092633992433548, + "rewards/tag_count_reward/std": 0.2489084005355835, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1364.2366943359375, + "completions/mean_terminated_length": 909.2416381835938, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.052421288157263865, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10421536892634764, + "kl": 0.0019969940185546875, + "learning_rate": 5.212765957446809e-07, + "loss": 0.115, + "num_tokens": 177553192.0, + "reward": 0.6579241156578064, + "reward_std": 0.4745548963546753, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2449776828289032, + "rewards/tag_count_reward/std": 0.2499494105577469, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1244.046875, + "completions/mean_terminated_length": 752.4208984375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.052634382824569816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11961361883166327, + "kl": 0.002269744873046875, + "learning_rate": 5.234042553191489e-07, + "loss": 0.15, + "num_tokens": 178172957.0, + "reward": 0.625558078289032, + "reward_std": 0.38029906153678894, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2193080335855484, + "rewards/tag_count_reward/std": 0.23656509816646576, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1305.4375, + "completions/mean_terminated_length": 811.3159790039062, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.052847477491875766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10489620471694665, + "kl": 0.002223968505859375, + "learning_rate": 5.25531914893617e-07, + "loss": 0.1139, + "num_tokens": 178825393.0, + "reward": 0.6579241156578064, + "reward_std": 0.42199236154556274, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.25390625, + "rewards/tag_count_reward/std": 0.25822341442108154, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1375.1563720703125, + "completions/mean_terminated_length": 875.1050415039062, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.05306057215918172, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12310158123222194, + "kl": 0.0020694732666015625, + "learning_rate": 5.276595744680851e-07, + "loss": 0.0738, + "num_tokens": 179513991.0, + "reward": 0.6166294813156128, + "reward_std": 0.44393911957740784, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2349330335855484, + "rewards/tag_count_reward/std": 0.24215105175971985, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1395.6585693359375, + "completions/mean_terminated_length": 879.0040283203125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.05327366682648767, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10143866384812297, + "kl": 0.0019588470458984375, + "learning_rate": 5.297872340425532e-07, + "loss": 0.0952, + "num_tokens": 180207038.0, + "reward": 0.5993303656578064, + "reward_std": 0.3879188299179077, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2109375, + "rewards/tag_count_reward/std": 0.2190144956111908, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1411.87060546875, + "completions/mean_terminated_length": 960.2671508789062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.05348676149379362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1127356505588024, + "kl": 0.002155303955078125, + "learning_rate": 5.319148936170212e-07, + "loss": 0.0951, + "num_tokens": 180912340.0, + "reward": 0.5290178656578064, + "reward_std": 0.34892863035202026, + "rewards/accuracy_reward/mean": 0.3333333432674408, + "rewards/accuracy_reward/std": 0.47195106744766235, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2075892835855484, + "rewards/tag_count_reward/std": 0.22655968368053436, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1337.5804443359375, + "completions/mean_terminated_length": 837.8555297851562, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.05369985616109957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10831817968873333, + "kl": 0.002429962158203125, + "learning_rate": 5.340425531914894e-07, + "loss": 0.1064, + "num_tokens": 181574008.0, + "reward": 0.6004464626312256, + "reward_std": 0.36334195733070374, + "rewards/accuracy_reward/mean": 0.3660714328289032, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.234375, + "rewards/tag_count_reward/std": 0.24182668328285217, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1428.93310546875, + "completions/mean_terminated_length": 968.8482666015625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.05391295082840552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10370107091258478, + "kl": 0.0022106170654296875, + "learning_rate": 5.361702127659574e-07, + "loss": 0.1416, + "num_tokens": 182285594.0, + "reward": 0.6428571939468384, + "reward_std": 0.3974340856075287, + "rewards/accuracy_reward/mean": 0.4305555522441864, + "rewards/accuracy_reward/std": 0.495728075504303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2276785671710968, + "rewards/tag_count_reward/std": 0.24071939289569855, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1354.828125, + "completions/mean_terminated_length": 780.4856567382812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.05412604549571147, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.574807841611994, + "kl": 0.0021686553955078125, + "learning_rate": 5.382978723404255e-07, + "loss": 0.1137, + "num_tokens": 182964333.0, + "reward": 0.6082589626312256, + "reward_std": 0.4060520529747009, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2087053507566452, + "rewards/tag_count_reward/std": 0.23464499413967133, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1302.5513916015625, + "completions/mean_terminated_length": 888.4132080078125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.05433914016301742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1132689062105518, + "kl": 0.0024566650390625, + "learning_rate": 5.404255319148936e-07, + "loss": 0.1481, + "num_tokens": 183617268.0, + "reward": 0.6534598469734192, + "reward_std": 0.38801801204681396, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2516741156578064, + "rewards/tag_count_reward/std": 0.25055304169654846, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1307.274658203125, + "completions/mean_terminated_length": 858.5913696289062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.05455223483032337, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10981357267287487, + "kl": 0.0023651123046875, + "learning_rate": 5.425531914893617e-07, + "loss": 0.1381, + "num_tokens": 184268719.0, + "reward": 0.7204241156578064, + "reward_std": 0.4415828585624695, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2516741156578064, + "rewards/tag_count_reward/std": 0.23737114667892456, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1449.47998046875, + "completions/mean_terminated_length": 944.5555419921875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.05476532949762932, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11774942896434641, + "kl": 0.00235748291015625, + "learning_rate": 5.446808510638298e-07, + "loss": 0.1221, + "num_tokens": 184995782.0, + "reward": 0.5954241156578064, + "reward_std": 0.40222373604774475, + "rewards/accuracy_reward/mean": 0.3958333432674408, + "rewards/accuracy_reward/std": 0.4895959198474884, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2137276828289032, + "rewards/tag_count_reward/std": 0.23695528507232666, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1398.2254638671875, + "completions/mean_terminated_length": 874.2136840820312, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.05497842416493527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0722756405134082, + "kl": 0.00913238525390625, + "learning_rate": 5.468085106382978e-07, + "loss": 0.103, + "num_tokens": 185705531.0, + "reward": 0.498325914144516, + "reward_std": 0.38193479180336, + "rewards/accuracy_reward/mean": 0.2777777910232544, + "rewards/accuracy_reward/std": 0.44842249155044556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.23046875, + "rewards/tag_count_reward/std": 0.23538535833358765, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1431.274658203125, + "completions/mean_terminated_length": 947.2310791015625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.05519151883224122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10516389543313849, + "kl": 0.00211334228515625, + "learning_rate": 5.48936170212766e-07, + "loss": 0.1272, + "num_tokens": 186416982.0, + "reward": 0.5876116156578064, + "reward_std": 0.4079829156398773, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2126116007566452, + "rewards/tag_count_reward/std": 0.22215762734413147, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1346.0670166015625, + "completions/mean_terminated_length": 920.8817138671875, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.05540461349954717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1492683471879114, + "kl": 0.0026035308837890625, + "learning_rate": 5.51063829787234e-07, + "loss": 0.0796, + "num_tokens": 187090148.0, + "reward": 0.6467634439468384, + "reward_std": 0.3375851809978485, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2449776828289032, + "rewards/tag_count_reward/std": 0.2384992092847824, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1581.982177734375, + "completions/mean_terminated_length": 1029.5804443359375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.05561770816685312, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09441529676649732, + "kl": 0.0018463134765625, + "learning_rate": 5.531914893617021e-07, + "loss": 0.0808, + "num_tokens": 187870380.0, + "reward": 0.5167410969734192, + "reward_std": 0.4009559750556946, + "rewards/accuracy_reward/mean": 0.3370535671710968, + "rewards/accuracy_reward/std": 0.47323182225227356, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1796875, + "rewards/tag_count_reward/std": 0.19876663386821747, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1360.243408203125, + "completions/mean_terminated_length": 955.3936157226562, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.055830802834159074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12006458425020748, + "kl": 0.0020923614501953125, + "learning_rate": 5.553191489361701e-07, + "loss": 0.1594, + "num_tokens": 188546233.0, + "reward": 0.6651785969734192, + "reward_std": 0.4914795458316803, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2388392835855484, + "rewards/tag_count_reward/std": 0.2432268112897873, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1402.8951416015625, + "completions/mean_terminated_length": 940.6934814453125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.056043897501465025, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10158763031076198, + "kl": 0.002044677734375, + "learning_rate": 5.574468085106383e-07, + "loss": 0.1016, + "num_tokens": 189247962.0, + "reward": 0.6863839626312256, + "reward_std": 0.4087267220020294, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2421875, + "rewards/tag_count_reward/std": 0.23695333302021027, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1272.4129638671875, + "completions/mean_terminated_length": 833.0944213867188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.056256992168770975, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11167561631369617, + "kl": 0.002597808837890625, + "learning_rate": 5.595744680851063e-07, + "loss": 0.1561, + "num_tokens": 189883235.0, + "reward": 0.6328125, + "reward_std": 0.4506607949733734, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2421875, + "rewards/tag_count_reward/std": 0.23813055455684662, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1477.0535888671875, + "completions/mean_terminated_length": 950.214599609375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.056470086836076926, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.09737646782863565, + "kl": 0.002094268798828125, + "learning_rate": 5.617021276595744e-07, + "loss": 0.0892, + "num_tokens": 190619819.0, + "reward": 0.5580357313156128, + "reward_std": 0.3536396324634552, + "rewards/accuracy_reward/mean": 0.3325892984867096, + "rewards/accuracy_reward/std": 0.47166749835014343, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2254464328289032, + "rewards/tag_count_reward/std": 0.24681374430656433, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1355.122802734375, + "completions/mean_terminated_length": 894.0631713867188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.056683181503382876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11131382201245558, + "kl": 0.002262115478515625, + "learning_rate": 5.638297872340425e-07, + "loss": 0.1134, + "num_tokens": 191298946.0, + "reward": 0.6830357313156128, + "reward_std": 0.42979133129119873, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2522321343421936, + "rewards/tag_count_reward/std": 0.24232175946235657, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1306.904052734375, + "completions/mean_terminated_length": 799.83837890625, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.05689627617068883, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.16666765437193687, + "kl": 0.002765655517578125, + "learning_rate": 5.659574468085107e-07, + "loss": 0.1017, + "num_tokens": 191952439.0, + "reward": 0.6556919813156128, + "reward_std": 0.40206488966941833, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2717633843421936, + "rewards/tag_count_reward/std": 0.26057201623916626, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1388.9576416015625, + "completions/mean_terminated_length": 894.67578125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.05710937083799478, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1011795657906066, + "kl": 0.00235748291015625, + "learning_rate": 5.680851063829787e-07, + "loss": 0.1263, + "num_tokens": 192640932.0, + "reward": 0.6322544813156128, + "reward_std": 0.3782602846622467, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2371651828289032, + "rewards/tag_count_reward/std": 0.2502289414405823, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1309.852783203125, + "completions/mean_terminated_length": 862.731201171875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.05732246550530073, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14259109791603855, + "kl": 0.00262451171875, + "learning_rate": 5.702127659574469e-07, + "loss": 0.147, + "num_tokens": 193294466.0, + "reward": 0.6439732313156128, + "reward_std": 0.4190162122249603, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2667410671710968, + "rewards/tag_count_reward/std": 0.2670345902442932, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1332.7098388671875, + "completions/mean_terminated_length": 895.3021850585938, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.05753556017260668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15700509761295714, + "kl": 0.00263214111328125, + "learning_rate": 5.723404255319149e-07, + "loss": 0.1043, + "num_tokens": 193962272.0, + "reward": 0.6707589626312256, + "reward_std": 0.40346354246139526, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2600446343421936, + "rewards/tag_count_reward/std": 0.252857506275177, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1463.8282470703125, + "completions/mean_terminated_length": 924.785400390625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.05774865483991263, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09990787000231265, + "kl": 0.002315521240234375, + "learning_rate": 5.74468085106383e-07, + "loss": 0.0854, + "num_tokens": 194690915.0, + "reward": 0.6473214626312256, + "reward_std": 0.4259307086467743, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2321428507566452, + "rewards/tag_count_reward/std": 0.2507578730583191, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1163.171875, + "completions/mean_terminated_length": 832.0398559570312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.05796174950721858, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.3606952081114682, + "kl": 0.00356292724609375, + "learning_rate": 5.76595744680851e-07, + "loss": 0.0752, + "num_tokens": 195278576.0, + "reward": 0.8822544813156128, + "reward_std": 0.4029451310634613, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2862723171710968, + "rewards/tag_count_reward/std": 0.2632318139076233, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1444.0491943359375, + "completions/mean_terminated_length": 934.5431518554688, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.05817484417452453, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10636257933505665, + "kl": 0.002384185791015625, + "learning_rate": 5.787234042553191e-07, + "loss": 0.1347, + "num_tokens": 195992230.0, + "reward": 0.6953125596046448, + "reward_std": 0.46234703063964844, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2667410671710968, + "rewards/tag_count_reward/std": 0.25906145572662354, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1375.013427734375, + "completions/mean_terminated_length": 990.1123046875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.05838793884183048, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11160007530533718, + "kl": 0.002391815185546875, + "learning_rate": 5.808510638297872e-07, + "loss": 0.0999, + "num_tokens": 196681724.0, + "reward": 0.6914063096046448, + "reward_std": 0.38176435232162476, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2561383843421936, + "rewards/tag_count_reward/std": 0.23788601160049438, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1268.8638916015625, + "completions/mean_terminated_length": 914.7110595703125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.05860103350913643, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11314332249872504, + "kl": 0.00270843505859375, + "learning_rate": 5.829787234042552e-07, + "loss": 0.1064, + "num_tokens": 197316591.0, + "reward": 0.7561384439468384, + "reward_std": 0.38534533977508545, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2717633843421936, + "rewards/tag_count_reward/std": 0.24508734047412872, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1347.029052734375, + "completions/mean_terminated_length": 942.242919921875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.05881412817644238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11322320281886235, + "kl": 0.00263214111328125, + "learning_rate": 5.851063829787234e-07, + "loss": 0.1232, + "num_tokens": 197984636.0, + "reward": 0.8565848469734192, + "reward_std": 0.45551785826683044, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2896205484867096, + "rewards/tag_count_reward/std": 0.2562822997570038, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1441.3013916015625, + "completions/mean_terminated_length": 910.75732421875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.05902722284374833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0981790236901551, + "kl": 0.002227783203125, + "learning_rate": 5.872340425531914e-07, + "loss": 0.1128, + "num_tokens": 198705715.0, + "reward": 0.60546875, + "reward_std": 0.4871624708175659, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.4852356016635895, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2282366007566452, + "rewards/tag_count_reward/std": 0.24565714597702026, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1352.680908203125, + "completions/mean_terminated_length": 977.54296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.05924031751105428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12201504936080408, + "kl": 0.002719879150390625, + "learning_rate": 5.893617021276595e-07, + "loss": 0.0999, + "num_tokens": 199381140.0, + "reward": 0.640625, + "reward_std": 0.3989824056625366, + "rewards/accuracy_reward/mean": 0.3616071343421936, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2790178656578064, + "rewards/tag_count_reward/std": 0.24461889266967773, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1402.9420166015625, + "completions/mean_terminated_length": 910.2598266601562, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.05945341217836023, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.934626448161137, + "kl": 0.00521087646484375, + "learning_rate": 5.914893617021275e-07, + "loss": 0.0746, + "num_tokens": 200084890.0, + "reward": 0.58984375, + "reward_std": 0.3934418261051178, + "rewards/accuracy_reward/mean": 0.3147321343421936, + "rewards/accuracy_reward/std": 0.4649282991886139, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2751116156578064, + "rewards/tag_count_reward/std": 0.27743226289749146, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1368.24560546875, + "completions/mean_terminated_length": 885.6717529296875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.059666506845666184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.107300934420347, + "kl": 0.00250244140625, + "learning_rate": 5.936170212765958e-07, + "loss": 0.1175, + "num_tokens": 200772744.0, + "reward": 0.6395089626312256, + "reward_std": 0.4106302857398987, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2511160671710968, + "rewards/tag_count_reward/std": 0.2680797576904297, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1372.4554443359375, + "completions/mean_terminated_length": 837.4240112304688, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.059879601512972135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09809112278554397, + "kl": 0.002422332763671875, + "learning_rate": 5.957446808510638e-07, + "loss": 0.1306, + "num_tokens": 201465812.0, + "reward": 0.640625, + "reward_std": 0.447689950466156, + "rewards/accuracy_reward/mean": 0.3660714328289032, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2745535671710968, + "rewards/tag_count_reward/std": 0.27723026275634766, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1347.618408203125, + "completions/mean_terminated_length": 907.0145263671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.060092696180278085, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12191400328705543, + "kl": 0.002895355224609375, + "learning_rate": 5.978723404255319e-07, + "loss": 0.0979, + "num_tokens": 202137801.0, + "reward": 0.7020089626312256, + "reward_std": 0.42347556352615356, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2912946343421936, + "rewards/tag_count_reward/std": 0.2643469274044037, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1412.4107666015625, + "completions/mean_terminated_length": 913.561767578125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.06030579084758404, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10329346225576208, + "kl": 0.002796173095703125, + "learning_rate": 6e-07, + "loss": 0.1379, + "num_tokens": 202844657.0, + "reward": 0.6205357313156128, + "reward_std": 0.41759294271469116, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.2800706923007965, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1258.607177734375, + "completions/mean_terminated_length": 841.0101928710938, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.06051888551488999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.117627392301162, + "kl": 0.003108978271484375, + "learning_rate": 6.021276595744681e-07, + "loss": 0.1359, + "num_tokens": 203475953.0, + "reward": 0.7706473469734192, + "reward_std": 0.4639025330543518, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3197544515132904, + "rewards/tag_count_reward/std": 0.2838204801082611, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1422.8148193359375, + "completions/mean_terminated_length": 953.92578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.060731980182195944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10814723346279091, + "kl": 0.002681732177734375, + "learning_rate": 6.042553191489361e-07, + "loss": 0.1101, + "num_tokens": 204181134.0, + "reward": 0.65234375, + "reward_std": 0.42916712164878845, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2438616007566452, + "rewards/tag_count_reward/std": 0.2651267945766449, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1309.6317138671875, + "completions/mean_terminated_length": 870.81494140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.060945074849501894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1961563161108591, + "kl": 0.003780364990234375, + "learning_rate": 6.063829787234043e-07, + "loss": 0.1511, + "num_tokens": 204834713.0, + "reward": 0.7885044813156128, + "reward_std": 0.42053401470184326, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3510044515132904, + "rewards/tag_count_reward/std": 0.2876738905906677, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1340.3125, + "completions/mean_terminated_length": 851.6075439453125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.061158169516807845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1041817157682442, + "kl": 0.003040313720703125, + "learning_rate": 6.085106382978723e-07, + "loss": 0.13, + "num_tokens": 205505381.0, + "reward": 0.6992188096046448, + "reward_std": 0.42712050676345825, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2795758843421936, + "rewards/tag_count_reward/std": 0.2769909203052521, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1413.5379638671875, + "completions/mean_terminated_length": 812.17822265625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.061371264184113795, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10002502646311803, + "kl": 0.002445220947265625, + "learning_rate": 6.106382978723404e-07, + "loss": 0.1126, + "num_tokens": 206204630.0, + "reward": 0.5719866156578064, + "reward_std": 0.41618457436561584, + "rewards/accuracy_reward/mean": 0.3102678656578064, + "rewards/accuracy_reward/std": 0.46312037110328674, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.26171875, + "rewards/tag_count_reward/std": 0.28131988644599915, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1336.419677734375, + "completions/mean_terminated_length": 880.2783813476562, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.061584358851419746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11438909649491394, + "kl": 0.00281524658203125, + "learning_rate": 6.127659574468084e-07, + "loss": 0.1182, + "num_tokens": 206866866.0, + "reward": 0.6830357313156128, + "reward_std": 0.46057838201522827, + "rewards/accuracy_reward/mean": 0.4027777910232544, + "rewards/accuracy_reward/std": 0.4910254180431366, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2946428656578064, + "rewards/tag_count_reward/std": 0.27874815464019775, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1465.3817138671875, + "completions/mean_terminated_length": 903.2061767578125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0617974535187257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10553625637377327, + "kl": 0.00267791748046875, + "learning_rate": 6.148936170212766e-07, + "loss": 0.1264, + "num_tokens": 207586205.0, + "reward": 0.6049107313156128, + "reward_std": 0.4599440097808838, + "rewards/accuracy_reward/mean": 0.3348214328289032, + "rewards/accuracy_reward/std": 0.47245556116104126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2700892984867096, + "rewards/tag_count_reward/std": 0.27859586477279663, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1385.8751220703125, + "completions/mean_terminated_length": 915.8167724609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.06201054818603165, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10441304913743134, + "kl": 0.002758026123046875, + "learning_rate": 6.170212765957446e-07, + "loss": 0.158, + "num_tokens": 208275893.0, + "reward": 0.6283482313156128, + "reward_std": 0.4457703232765198, + "rewards/accuracy_reward/mean": 0.3526785671710968, + "rewards/accuracy_reward/std": 0.4783378541469574, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2756696343421936, + "rewards/tag_count_reward/std": 0.2806384563446045, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1284.3348388671875, + "completions/mean_terminated_length": 752.0833740234375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.0622236428533376, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.16842743455439116, + "kl": 0.003795623779296875, + "learning_rate": 6.191489361702127e-07, + "loss": 0.1495, + "num_tokens": 208921803.0, + "reward": 0.656808078289032, + "reward_std": 0.39747610688209534, + "rewards/accuracy_reward/mean": 0.3526785671710968, + "rewards/accuracy_reward/std": 0.4783378541469574, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3041294515132904, + "rewards/tag_count_reward/std": 0.2949039936065674, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1291.9285888671875, + "completions/mean_terminated_length": 825.18408203125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06243673752064355, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11133438945561518, + "kl": 0.003185272216796875, + "learning_rate": 6.212765957446809e-07, + "loss": 0.1226, + "num_tokens": 209570507.0, + "reward": 0.7243303656578064, + "reward_std": 0.3983565866947174, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3091517984867096, + "rewards/tag_count_reward/std": 0.29701149463653564, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1295.1585693359375, + "completions/mean_terminated_length": 892.9554443359375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06264983218794949, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1263571544603017, + "kl": 0.003376007080078125, + "learning_rate": 6.234042553191489e-07, + "loss": 0.1032, + "num_tokens": 210227506.0, + "reward": 0.7784598469734192, + "reward_std": 0.5312626361846924, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.30078125, + "rewards/tag_count_reward/std": 0.28783005475997925, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1268.5491943359375, + "completions/mean_terminated_length": 880.1270751953125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06286292685525545, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10021198258450638, + "kl": 0.0032501220703125, + "learning_rate": 6.25531914893617e-07, + "loss": 0.1431, + "num_tokens": 210859768.0, + "reward": 0.8158482313156128, + "reward_std": 0.47183746099472046, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3024553656578064, + "rewards/tag_count_reward/std": 0.2743399441242218, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1364.243408203125, + "completions/mean_terminated_length": 905.0037231445312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.06307602152256139, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10904644072605166, + "kl": 0.003192901611328125, + "learning_rate": 6.276595744680851e-07, + "loss": 0.1306, + "num_tokens": 211544629.0, + "reward": 0.6785714626312256, + "reward_std": 0.38587620854377747, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2991071343421936, + "rewards/tag_count_reward/std": 0.27291879057884216, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1412.4598388671875, + "completions/mean_terminated_length": 913.6494140625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.06328911618986735, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1114235807426125, + "kl": 0.0030364990234375, + "learning_rate": 6.297872340425532e-07, + "loss": 0.1057, + "num_tokens": 212249091.0, + "reward": 0.566964328289032, + "reward_std": 0.41129270195961, + "rewards/accuracy_reward/mean": 0.3058035671710968, + "rewards/accuracy_reward/std": 0.4612620174884796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2611607015132904, + "rewards/tag_count_reward/std": 0.28553569316864014, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1212.2857666015625, + "completions/mean_terminated_length": 851.8338623046875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0635022108571733, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12355584922860169, + "kl": 0.004459381103515625, + "learning_rate": 6.319148936170212e-07, + "loss": 0.0972, + "num_tokens": 212855411.0, + "reward": 0.848214328289032, + "reward_std": 0.46111035346984863, + "rewards/accuracy_reward/mean": 0.48842594027519226, + "rewards/accuracy_reward/std": 0.500445544719696, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3772321343421936, + "rewards/tag_count_reward/std": 0.3014608919620514, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1363.04248046875, + "completions/mean_terminated_length": 898.7078857421875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06371530552447925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6525170299487657, + "kl": 0.00958251953125, + "learning_rate": 6.340425531914893e-07, + "loss": 0.1403, + "num_tokens": 213544838.0, + "reward": 0.6640625, + "reward_std": 0.43487489223480225, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3046875, + "rewards/tag_count_reward/std": 0.28294217586517334, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1379.247802734375, + "completions/mean_terminated_length": 1018.4432983398438, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.0639284001917852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10703874401396833, + "kl": 0.0035552978515625, + "learning_rate": 6.361702127659574e-07, + "loss": 0.1367, + "num_tokens": 214227429.0, + "reward": 0.8532366156578064, + "reward_std": 0.49103841185569763, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3577008843421936, + "rewards/tag_count_reward/std": 0.2929672598838806, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 1410.7344970703125, + "completions/mean_terminated_length": 887.451171875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.06414149485909115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1057458392485803, + "kl": 0.003360748291015625, + "learning_rate": 6.382978723404255e-07, + "loss": 0.0901, + "num_tokens": 214935198.0, + "reward": 0.5853794813156128, + "reward_std": 0.3734409213066101, + "rewards/accuracy_reward/mean": 0.2991071343421936, + "rewards/accuracy_reward/std": 0.45837870240211487, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2862723171710968, + "rewards/tag_count_reward/std": 0.28905490040779114, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1351.77685546875, + "completions/mean_terminated_length": 913.7891235351562, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.0643545895263971, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11870056565425288, + "kl": 0.004306793212890625, + "learning_rate": 6.404255319148935e-07, + "loss": 0.1716, + "num_tokens": 215610442.0, + "reward": 0.7053571939468384, + "reward_std": 0.38001716136932373, + "rewards/accuracy_reward/mean": 0.3683035671710968, + "rewards/accuracy_reward/std": 0.4828835725784302, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3370535671710968, + "rewards/tag_count_reward/std": 0.30508336424827576, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1357.837158203125, + "completions/mean_terminated_length": 835.4784545898438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.06456768419370305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1129373616142468, + "kl": 0.0040740966796875, + "learning_rate": 6.425531914893617e-07, + "loss": 0.1299, + "num_tokens": 216290385.0, + "reward": 0.7142857313156128, + "reward_std": 0.4183560907840729, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3191964328289032, + "rewards/tag_count_reward/std": 0.3037053346633911, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1275.05810546875, + "completions/mean_terminated_length": 820.0637817382812, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.064780778861009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11391010831281453, + "kl": 0.003643035888671875, + "learning_rate": 6.446808510638297e-07, + "loss": 0.121, + "num_tokens": 216933899.0, + "reward": 0.7042410969734192, + "reward_std": 0.47441548109054565, + "rewards/accuracy_reward/mean": 0.3526785671710968, + "rewards/accuracy_reward/std": 0.4783378541469574, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3515625, + "rewards/tag_count_reward/std": 0.29821956157684326, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1204.015625, + "completions/mean_terminated_length": 820.3863525390625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.06499387352831495, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12531485754865115, + "kl": 0.004322052001953125, + "learning_rate": 6.468085106382979e-07, + "loss": 0.143, + "num_tokens": 217539154.0, + "reward": 0.8158482313156128, + "reward_std": 0.465718150138855, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3515625, + "rewards/tag_count_reward/std": 0.29206088185310364, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1302.4910888671875, + "completions/mean_terminated_length": 829.065673828125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.0652069681956209, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11016940336135449, + "kl": 0.003643035888671875, + "learning_rate": 6.48936170212766e-07, + "loss": 0.1791, + "num_tokens": 218187134.0, + "reward": 0.785714328289032, + "reward_std": 0.46417149901390076, + "rewards/accuracy_reward/mean": 0.4652777910232544, + "rewards/accuracy_reward/std": 0.499371200799942, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3370535671710968, + "rewards/tag_count_reward/std": 0.30324462056159973, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1309.009033203125, + "completions/mean_terminated_length": 852.8086547851562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06542006286292686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11202130060878548, + "kl": 0.0044708251953125, + "learning_rate": 6.510638297872341e-07, + "loss": 0.1238, + "num_tokens": 218844226.0, + "reward": 0.6718750596046448, + "reward_std": 0.4207690954208374, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3258928656578064, + "rewards/tag_count_reward/std": 0.3039436638355255, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1346.743408203125, + "completions/mean_terminated_length": 888.7269287109375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.0656331575302328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11314957326779536, + "kl": 0.0044097900390625, + "learning_rate": 6.531914893617021e-07, + "loss": 0.1227, + "num_tokens": 219511519.0, + "reward": 0.8426339626312256, + "reward_std": 0.4310063421726227, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3694196343421936, + "rewards/tag_count_reward/std": 0.3046472668647766, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1244.0201416015625, + "completions/mean_terminated_length": 851.3787231445312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06584625219753876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13730477351699344, + "kl": 0.00449371337890625, + "learning_rate": 6.553191489361701e-07, + "loss": 0.1094, + "num_tokens": 220135016.0, + "reward": 0.7511160969734192, + "reward_std": 0.4742560386657715, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3515625, + "rewards/tag_count_reward/std": 0.30608630180358887, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1184.107177734375, + "completions/mean_terminated_length": 853.4815063476562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0660593468648447, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10185161511489055, + "kl": 0.004428863525390625, + "learning_rate": 6.574468085106383e-07, + "loss": 0.1334, + "num_tokens": 220732424.0, + "reward": 0.7907366156578064, + "reward_std": 0.42039892077445984, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.38671875, + "rewards/tag_count_reward/std": 0.3076702952384949, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1161.015625, + "completions/mean_terminated_length": 778.450439453125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.06627244153215066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1387803492721846, + "kl": 0.0057373046875, + "learning_rate": 6.595744680851063e-07, + "loss": 0.1336, + "num_tokens": 221323167.0, + "reward": 0.8320313096046448, + "reward_std": 0.4504760205745697, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3610491156578064, + "rewards/tag_count_reward/std": 0.30803120136260986, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1379.0491943359375, + "completions/mean_terminated_length": 872.7451782226562, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.0664855361994566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18000329121600528, + "kl": 0.003940582275390625, + "learning_rate": 6.617021276595744e-07, + "loss": 0.1474, + "num_tokens": 222012949.0, + "reward": 0.7159598469734192, + "reward_std": 0.44996094703674316, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3364955484867096, + "rewards/tag_count_reward/std": 0.3036349415779114, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1310.2835693359375, + "completions/mean_terminated_length": 771.9497680664062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06669863086676256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11125159813996138, + "kl": 0.004058837890625, + "learning_rate": 6.638297872340425e-07, + "loss": 0.1663, + "num_tokens": 222672740.0, + "reward": 0.7589285969734192, + "reward_std": 0.44345101714134216, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3459821343421936, + "rewards/tag_count_reward/std": 0.31149765849113464, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1458.26123046875, + "completions/mean_terminated_length": 928.4957885742188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.06691172553406852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10535689333378054, + "kl": 0.00356292724609375, + "learning_rate": 6.659574468085106e-07, + "loss": 0.1258, + "num_tokens": 223403561.0, + "reward": 0.7377232313156128, + "reward_std": 0.48062777519226074, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3247767984867096, + "rewards/tag_count_reward/std": 0.29345956444740295, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1304.10498046875, + "completions/mean_terminated_length": 836.1272583007812, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.06712482020137446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12201852959040368, + "kl": 0.004032135009765625, + "learning_rate": 6.680851063829786e-07, + "loss": 0.1186, + "num_tokens": 224066168.0, + "reward": 0.7672991156578064, + "reward_std": 0.4223143756389618, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3409598171710968, + "rewards/tag_count_reward/std": 0.3050869405269623, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1344.8170166015625, + "completions/mean_terminated_length": 863.6917724609375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.06733791486868042, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11422033281040551, + "kl": 0.00428009033203125, + "learning_rate": 6.702127659574468e-07, + "loss": 0.0915, + "num_tokens": 224731766.0, + "reward": 0.6986607313156128, + "reward_std": 0.4488019347190857, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399619221687317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.359375, + "rewards/tag_count_reward/std": 0.3190997540950775, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1215.1741943359375, + "completions/mean_terminated_length": 852.1474609375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.06755100953598636, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1413236481182424, + "kl": 0.005096435546875, + "learning_rate": 6.723404255319149e-07, + "loss": 0.1195, + "num_tokens": 225343348.0, + "reward": 0.8035714626312256, + "reward_std": 0.4650380611419678, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3683035671710968, + "rewards/tag_count_reward/std": 0.2881123721599579, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1364.2723388671875, + "completions/mean_terminated_length": 954.0357055664062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06776410420329232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3059060722790863, + "kl": 0.00421905517578125, + "learning_rate": 6.74468085106383e-07, + "loss": 0.1343, + "num_tokens": 226032782.0, + "reward": 0.7064732313156128, + "reward_std": 0.4493306875228882, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3604910671710968, + "rewards/tag_count_reward/std": 0.2959672808647156, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1375.8817138671875, + "completions/mean_terminated_length": 876.36962890625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.06797719887059826, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1126971288726166, + "kl": 0.004131317138671875, + "learning_rate": 6.76595744680851e-07, + "loss": 0.1509, + "num_tokens": 226724473.0, + "reward": 0.754464328289032, + "reward_std": 0.4341088831424713, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3169642984867096, + "rewards/tag_count_reward/std": 0.30328577756881714, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1377.9688720703125, + "completions/mean_terminated_length": 897.9080200195312, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.06819029353790422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1047019472189945, + "kl": 0.003772735595703125, + "learning_rate": 6.787234042553192e-07, + "loss": 0.1354, + "num_tokens": 227418587.0, + "reward": 0.6869419813156128, + "reward_std": 0.43988627195358276, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211319923401, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.33203125, + "rewards/tag_count_reward/std": 0.3067030608654022, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1506.3973388671875, + "completions/mean_terminated_length": 940.0639038085938, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.06840338820521016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09759000097938918, + "kl": 0.0033111572265625, + "learning_rate": 6.808510638297872e-07, + "loss": 0.1158, + "num_tokens": 228164109.0, + "reward": 0.5524553656578064, + "reward_std": 0.4266922175884247, + "rewards/accuracy_reward/mean": 0.2566964328289032, + "rewards/accuracy_reward/std": 0.4372987747192383, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2957589328289032, + "rewards/tag_count_reward/std": 0.30857232213020325, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1134.0670166015625, + "completions/mean_terminated_length": 731.4662475585938, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.06861648287251612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.160229197618124, + "kl": 0.004913330078125, + "learning_rate": 6.829787234042553e-07, + "loss": 0.1532, + "num_tokens": 228737675.0, + "reward": 0.8476563096046448, + "reward_std": 0.4259325861930847, + "rewards/accuracy_reward/mean": 0.44907405972480774, + "rewards/accuracy_reward/std": 0.49797651171684265, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4146205484867096, + "rewards/tag_count_reward/std": 0.30624276399612427, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1250.727783203125, + "completions/mean_terminated_length": 794.743896484375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.06882957753982206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11077532234015854, + "kl": 0.0046539306640625, + "learning_rate": 6.851063829787234e-07, + "loss": 0.124, + "num_tokens": 229365537.0, + "reward": 0.8275669813156128, + "reward_std": 0.4649622440338135, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3699776828289032, + "rewards/tag_count_reward/std": 0.3176877200603485, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1336.515625, + "completions/mean_terminated_length": 876.1433715820312, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.06904267220712802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11537342606505271, + "kl": 0.004520416259765625, + "learning_rate": 6.872340425531915e-07, + "loss": 0.1765, + "num_tokens": 230043624.0, + "reward": 0.7064732313156128, + "reward_std": 0.43799006938934326, + "rewards/accuracy_reward/mean": 0.3482142984867096, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3582589328289032, + "rewards/tag_count_reward/std": 0.31150367856025696, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1431.4129638671875, + "completions/mean_terminated_length": 964.7412109375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.06925576687443397, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13168027684807662, + "kl": 0.00611114501953125, + "learning_rate": 6.893617021276595e-07, + "loss": 0.129, + "num_tokens": 230755073.0, + "reward": 0.7539063096046448, + "reward_std": 0.43856027722358704, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3454241156578064, + "rewards/tag_count_reward/std": 0.31009531021118164, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1439.8170166015625, + "completions/mean_terminated_length": 962.4780883789062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06946886154173992, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10476827077645101, + "kl": 0.0042266845703125, + "learning_rate": 6.914893617021277e-07, + "loss": 0.0941, + "num_tokens": 231470927.0, + "reward": 0.7511160969734192, + "reward_std": 0.4309828281402588, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3225446343421936, + "rewards/tag_count_reward/std": 0.31513744592666626, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1035.122802734375, + "completions/mean_terminated_length": 847.5528564453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.06968195620904587, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12200688034336493, + "kl": 0.00629425048828125, + "learning_rate": 6.936170212765957e-07, + "loss": 0.0707, + "num_tokens": 231996902.0, + "reward": 1.109375, + "reward_std": 0.4636426270008087, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4620535671710968, + "rewards/tag_count_reward/std": 0.2890813648700714, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1353.83935546875, + "completions/mean_terminated_length": 896.2073974609375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06989505087635182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11730913789115854, + "kl": 0.00429534912109375, + "learning_rate": 6.957446808510637e-07, + "loss": 0.1252, + "num_tokens": 232674766.0, + "reward": 0.7578125596046448, + "reward_std": 0.48383283615112305, + "rewards/accuracy_reward/mean": 0.39814814925193787, + "rewards/accuracy_reward/std": 0.49008384346961975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3738839328289032, + "rewards/tag_count_reward/std": 0.28964006900787354, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1484.46435546875, + "completions/mean_terminated_length": 978.2373046875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.07010814554365777, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1083797455369247, + "kl": 0.00372314453125, + "learning_rate": 6.978723404255319e-07, + "loss": 0.1588, + "num_tokens": 233411614.0, + "reward": 0.617745578289032, + "reward_std": 0.4261285066604614, + "rewards/accuracy_reward/mean": 0.3348214328289032, + "rewards/accuracy_reward/std": 0.47245556116104126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2829241156578064, + "rewards/tag_count_reward/std": 0.29614174365997314, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1264.72998046875, + "completions/mean_terminated_length": 882.20263671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.07032124021096373, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11064147007348499, + "kl": 0.004802703857421875, + "learning_rate": 7e-07, + "loss": 0.1706, + "num_tokens": 234046517.0, + "reward": 0.8766741752624512, + "reward_std": 0.4159274399280548, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3900669515132904, + "rewards/tag_count_reward/std": 0.3084321916103363, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1418.33935546875, + "completions/mean_terminated_length": 1014.7106323242188, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.07053433487826967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10604201192818602, + "kl": 0.0043182373046875, + "learning_rate": 7.021276595744681e-07, + "loss": 0.099, + "num_tokens": 234755725.0, + "reward": 0.805245578289032, + "reward_std": 0.4905674457550049, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3699776828289032, + "rewards/tag_count_reward/std": 0.31724730134010315, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1340.950927734375, + "completions/mean_terminated_length": 985.0537109375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.07074742954557563, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11485927129809907, + "kl": 0.00485992431640625, + "learning_rate": 7.042553191489361e-07, + "loss": 0.1018, + "num_tokens": 235428631.0, + "reward": 0.8671875596046448, + "reward_std": 0.4256546199321747, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3627232015132904, + "rewards/tag_count_reward/std": 0.2884133756160736, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1325.109375, + "completions/mean_terminated_length": 878.848388671875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.07096052421288157, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11310105761655365, + "kl": 0.004547119140625, + "learning_rate": 7.063829787234043e-07, + "loss": 0.1112, + "num_tokens": 236092808.0, + "reward": 0.8392857313156128, + "reward_std": 0.4639196991920471, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3973214328289032, + "rewards/tag_count_reward/std": 0.30616989731788635, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1354.6429443359375, + "completions/mean_terminated_length": 880.2406005859375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.07117361888018753, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1046794921701192, + "kl": 0.0046844482421875, + "learning_rate": 7.085106382978723e-07, + "loss": 0.1118, + "num_tokens": 236762296.0, + "reward": 0.8900669813156128, + "reward_std": 0.4585322141647339, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3722098171710968, + "rewards/tag_count_reward/std": 0.31372949481010437, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1209.38623046875, + "completions/mean_terminated_length": 816.2000122070312, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.07138671354749347, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.343263602137767, + "kl": 0.1538238525390625, + "learning_rate": 7.106382978723404e-07, + "loss": 0.134, + "num_tokens": 237373509.0, + "reward": 0.9453125596046448, + "reward_std": 0.4081028401851654, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4229910671710968, + "rewards/tag_count_reward/std": 0.3100414574146271, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1225.453125, + "completions/mean_terminated_length": 859.287109375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.07159980821479943, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1169139833377291, + "kl": 0.0057525634765625, + "learning_rate": 7.127659574468084e-07, + "loss": 0.128, + "num_tokens": 237993472.0, + "reward": 0.9508929252624512, + "reward_std": 0.462897926568985, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4151785671710968, + "rewards/tag_count_reward/std": 0.30753690004348755, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1325.4375, + "completions/mean_terminated_length": 883.582763671875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.07181290288210537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30890027858823593, + "kl": 0.00490570068359375, + "learning_rate": 7.148936170212766e-07, + "loss": 0.1597, + "num_tokens": 238660212.0, + "reward": 0.7578125596046448, + "reward_std": 0.4775422215461731, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3761160671710968, + "rewards/tag_count_reward/std": 0.32854312658309937, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1347.7232666015625, + "completions/mean_terminated_length": 868.5864868164062, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.07202599754941133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10666511953622149, + "kl": 0.00457000732421875, + "learning_rate": 7.170212765957446e-07, + "loss": 0.172, + "num_tokens": 239334696.0, + "reward": 0.7382813096046448, + "reward_std": 0.45309746265411377, + "rewards/accuracy_reward/mean": 0.3888888955116272, + "rewards/accuracy_reward/std": 0.4880632162094116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.36328125, + "rewards/tag_count_reward/std": 0.3144131898880005, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1290.578125, + "completions/mean_terminated_length": 942.706787109375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.07223909221671727, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12307020779859715, + "kl": 0.0059967041015625, + "learning_rate": 7.191489361702127e-07, + "loss": 0.087, + "num_tokens": 239988731.0, + "reward": 0.9162946939468384, + "reward_std": 0.4513002038002014, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4229910671710968, + "rewards/tag_count_reward/std": 0.31318238377571106, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1069.9910888671875, + "completions/mean_terminated_length": 755.5280151367188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.07245218688402323, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1365874482826662, + "kl": 0.00689697265625, + "learning_rate": 7.212765957446808e-07, + "loss": 0.1578, + "num_tokens": 240532103.0, + "reward": 0.957589328289032, + "reward_std": 0.4438931941986084, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4642857015132904, + "rewards/tag_count_reward/std": 0.30305516719818115, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1140.0179443359375, + "completions/mean_terminated_length": 764.7949829101562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.07266528155132917, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11663750926784484, + "kl": 0.0072021484375, + "learning_rate": 7.23404255319149e-07, + "loss": 0.0926, + "num_tokens": 241106927.0, + "reward": 0.9190848469734192, + "reward_std": 0.3914845883846283, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4391741156578064, + "rewards/tag_count_reward/std": 0.32864710688591003, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1244.1273193359375, + "completions/mean_terminated_length": 867.2294921875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.07287837621863513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37253260087848583, + "kl": 0.017974853515625, + "learning_rate": 7.25531914893617e-07, + "loss": 0.1367, + "num_tokens": 241739512.0, + "reward": 0.9285714626312256, + "reward_std": 0.45305758714675903, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4464285671710968, + "rewards/tag_count_reward/std": 0.32110437750816345, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1344.7254638671875, + "completions/mean_terminated_length": 934.6890258789062, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.07309147088594108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10728719073805787, + "kl": 0.005767822265625, + "learning_rate": 7.276595744680852e-07, + "loss": 0.145, + "num_tokens": 242417309.0, + "reward": 0.9626116752624512, + "reward_std": 0.5431643724441528, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4402901828289032, + "rewards/tag_count_reward/std": 0.3219774663448334, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1311.9285888671875, + "completions/mean_terminated_length": 812.94384765625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.07330456555324703, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12286170755126098, + "kl": 0.00467681884765625, + "learning_rate": 7.297872340425532e-07, + "loss": 0.1576, + "num_tokens": 243083789.0, + "reward": 0.727120578289032, + "reward_std": 0.5145233273506165, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3722098171710968, + "rewards/tag_count_reward/std": 0.3150636851787567, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1346.9129638671875, + "completions/mean_terminated_length": 942.059814453125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.07351766022055298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1545889246069127, + "kl": 0.0072784423828125, + "learning_rate": 7.319148936170212e-07, + "loss": 0.1314, + "num_tokens": 243755302.0, + "reward": 0.8649554252624512, + "reward_std": 0.4807323217391968, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4073660671710968, + "rewards/tag_count_reward/std": 0.3320356011390686, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1288.618408203125, + "completions/mean_terminated_length": 858.47900390625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.07373075488785893, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11171664756426589, + "kl": 0.00605010986328125, + "learning_rate": 7.340425531914893e-07, + "loss": 0.1661, + "num_tokens": 244403915.0, + "reward": 0.9246652126312256, + "reward_std": 0.47537100315093994, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.43359375, + "rewards/tag_count_reward/std": 0.32283321022987366, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1325.337158203125, + "completions/mean_terminated_length": 879.2166137695312, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.07394384955516488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10749304991399612, + "kl": 0.00551605224609375, + "learning_rate": 7.361702127659574e-07, + "loss": 0.1634, + "num_tokens": 245063954.0, + "reward": 0.7678571939468384, + "reward_std": 0.5014542937278748, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4129464328289032, + "rewards/tag_count_reward/std": 0.3314428925514221, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1174.3616943359375, + "completions/mean_terminated_length": 843.7230834960938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.07415694422247084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12533643317460383, + "kl": 0.0067596435546875, + "learning_rate": 7.382978723404255e-07, + "loss": 0.1334, + "num_tokens": 245655924.0, + "reward": 1.0150669813156128, + "reward_std": 0.5282325148582458, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4614955484867096, + "rewards/tag_count_reward/std": 0.3182491958141327, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1274.591552734375, + "completions/mean_terminated_length": 861.4006958007812, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.07437003888977678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1113577187464603, + "kl": 0.00650787353515625, + "learning_rate": 7.404255319148935e-07, + "loss": 0.1266, + "num_tokens": 246292941.0, + "reward": 0.9514509439468384, + "reward_std": 0.4593999981880188, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4291294515132904, + "rewards/tag_count_reward/std": 0.321010559797287, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1351.509033203125, + "completions/mean_terminated_length": 933.6143188476562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.07458313355708274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35591362041197494, + "kl": 0.00732421875, + "learning_rate": 7.425531914893617e-07, + "loss": 0.1225, + "num_tokens": 246978577.0, + "reward": 0.8900669813156128, + "reward_std": 0.5231499075889587, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3967633843421936, + "rewards/tag_count_reward/std": 0.32005247473716736, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1220.93310546875, + "completions/mean_terminated_length": 821.0927124023438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.07479622822438868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11183090081876049, + "kl": 0.00676727294921875, + "learning_rate": 7.446808510638297e-07, + "loss": 0.1427, + "num_tokens": 247589667.0, + "reward": 0.9542410969734192, + "reward_std": 0.49563080072402954, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4520089328289032, + "rewards/tag_count_reward/std": 0.3263011574745178, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1262.9754638671875, + "completions/mean_terminated_length": 898.6830444335938, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.07500932289169464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11799169821100412, + "kl": 0.00638580322265625, + "learning_rate": 7.468085106382978e-07, + "loss": 0.0927, + "num_tokens": 248224008.0, + "reward": 0.9508929252624512, + "reward_std": 0.43274056911468506, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4620535671710968, + "rewards/tag_count_reward/std": 0.3254833221435547, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1456.3326416015625, + "completions/mean_terminated_length": 910.3734130859375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.07522241755900058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1000269961608101, + "kl": 0.00495147705078125, + "learning_rate": 7.489361702127658e-07, + "loss": 0.1172, + "num_tokens": 248945453.0, + "reward": 0.7723214626312256, + "reward_std": 0.4526480436325073, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3794642984867096, + "rewards/tag_count_reward/std": 0.33483800292015076, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1194.634033203125, + "completions/mean_terminated_length": 806.740234375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.07543551222630654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15690249638555528, + "kl": 0.0087890625, + "learning_rate": 7.510638297872341e-07, + "loss": 0.1279, + "num_tokens": 249546825.0, + "reward": 0.934151828289032, + "reward_std": 0.4254299998283386, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4497767984867096, + "rewards/tag_count_reward/std": 0.32033950090408325, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1164.5223388671875, + "completions/mean_terminated_length": 852.2356567382812, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.07564860689361248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1230482488458399, + "kl": 0.0076751708984375, + "learning_rate": 7.531914893617021e-07, + "loss": 0.0735, + "num_tokens": 250134483.0, + "reward": 1.0412946939468384, + "reward_std": 0.45647132396698, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4832589328289032, + "rewards/tag_count_reward/std": 0.31373992562294006, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1340.3148193359375, + "completions/mean_terminated_length": 819.1511840820312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.07586170156091844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10779877398978702, + "kl": 0.00525665283203125, + "learning_rate": 7.553191489361702e-07, + "loss": 0.1419, + "num_tokens": 250809264.0, + "reward": 0.7650669813156128, + "reward_std": 0.3833584189414978, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3833705484867096, + "rewards/tag_count_reward/std": 0.3368448317050934, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1298.4598388671875, + "completions/mean_terminated_length": 943.4144897460938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.07607479622822438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11427046399496257, + "kl": 0.00658416748046875, + "learning_rate": 7.574468085106383e-07, + "loss": 0.1082, + "num_tokens": 251466590.0, + "reward": 0.9587054252624512, + "reward_std": 0.47318387031555176, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4475446343421936, + "rewards/tag_count_reward/std": 0.3328317105770111, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1349.32373046875, + "completions/mean_terminated_length": 866.841552734375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.07628789089553034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.134753286835733, + "kl": 0.00582122802734375, + "learning_rate": 7.595744680851064e-07, + "loss": 0.154, + "num_tokens": 252140655.0, + "reward": 0.832589328289032, + "reward_std": 0.5002752542495728, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4263392984867096, + "rewards/tag_count_reward/std": 0.32579004764556885, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1452.9910888671875, + "completions/mean_terminated_length": 1014.8062133789062, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.07650098556283629, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11440168843888172, + "kl": 0.00560760498046875, + "learning_rate": 7.617021276595744e-07, + "loss": 0.1012, + "num_tokens": 252859771.0, + "reward": 0.7996652126312256, + "reward_std": 0.5459848642349243, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4246651828289032, + "rewards/tag_count_reward/std": 0.31911492347717285, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1300.0223388671875, + "completions/mean_terminated_length": 868.091552734375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.07671408023014224, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7055710484810085, + "kl": 0.05333709716796875, + "learning_rate": 7.638297872340426e-07, + "loss": 0.0895, + "num_tokens": 253506245.0, + "reward": 0.8722098469734192, + "reward_std": 0.4105775058269501, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4592633843421936, + "rewards/tag_count_reward/std": 0.3240683078765869, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1335.087158203125, + "completions/mean_terminated_length": 869.4575805664062, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.07692717489744819, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4811192671746188, + "kl": 0.0089111328125, + "learning_rate": 7.659574468085106e-07, + "loss": 0.1555, + "num_tokens": 254170508.0, + "reward": 0.7656250596046448, + "reward_std": 0.4630606770515442, + "rewards/accuracy_reward/mean": 0.3680555522441864, + "rewards/accuracy_reward/std": 0.48283568024635315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4107142984867096, + "rewards/tag_count_reward/std": 0.33042433857917786, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1341.12060546875, + "completions/mean_terminated_length": 834.6589965820312, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.07714026956475414, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.7916810416441262, + "kl": 0.02782440185546875, + "learning_rate": 7.680851063829787e-07, + "loss": 0.1184, + "num_tokens": 254844706.0, + "reward": 0.8225446939468384, + "reward_std": 0.5005802512168884, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4185267984867096, + "rewards/tag_count_reward/std": 0.3337007761001587, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1180.279052734375, + "completions/mean_terminated_length": 855.549072265625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.07735336423206009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1488906691991433, + "kl": 0.00792694091796875, + "learning_rate": 7.702127659574467e-07, + "loss": 0.145, + "num_tokens": 255438815.0, + "reward": 1.07421875, + "reward_std": 0.4518907070159912, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.49609375, + "rewards/tag_count_reward/std": 0.32445329427719116, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1201.8170166015625, + "completions/mean_terminated_length": 844.5396728515625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.07756645889936604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11845335589646902, + "kl": 0.00675201416015625, + "learning_rate": 7.723404255319148e-07, + "loss": 0.1603, + "num_tokens": 256049661.0, + "reward": 0.9363839626312256, + "reward_std": 0.4685453772544861, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4654017984867096, + "rewards/tag_count_reward/std": 0.31671810150146484, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1273.9129638671875, + "completions/mean_terminated_length": 839.6690063476562, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.07777955356667199, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12278831565279281, + "kl": 0.006683349609375, + "learning_rate": 7.744680851063829e-07, + "loss": 0.1383, + "num_tokens": 256690502.0, + "reward": 0.902901828289032, + "reward_std": 0.47695741057395935, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4631696343421936, + "rewards/tag_count_reward/std": 0.3403094410896301, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1257.794677734375, + "completions/mean_terminated_length": 891.0980224609375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.07799264823397795, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13619905747048863, + "kl": 0.00888824462890625, + "learning_rate": 7.765957446808509e-07, + "loss": 0.1136, + "num_tokens": 257325002.0, + "reward": 0.949776828289032, + "reward_std": 0.3965776264667511, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4564732015132904, + "rewards/tag_count_reward/std": 0.3200119733810425, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1344.372802734375, + "completions/mean_terminated_length": 862.943603515625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.07820574290128389, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10920086514841651, + "kl": 0.0057830810546875, + "learning_rate": 7.787234042553192e-07, + "loss": 0.1375, + "num_tokens": 257997777.0, + "reward": 0.8348214626312256, + "reward_std": 0.46554961800575256, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4196428656578064, + "rewards/tag_count_reward/std": 0.3310283124446869, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1367.7501220703125, + "completions/mean_terminated_length": 884.8244018554688, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.07841883756858985, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10960442022258464, + "kl": 0.0053558349609375, + "learning_rate": 7.808510638297872e-07, + "loss": 0.1203, + "num_tokens": 258676897.0, + "reward": 0.9062500596046448, + "reward_std": 0.45538076758384705, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4285714328289032, + "rewards/tag_count_reward/std": 0.3301219344139099, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1238.4576416015625, + "completions/mean_terminated_length": 870.4837646484375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.07863193223589579, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22078999102770358, + "kl": 0.0058441162109375, + "learning_rate": 7.829787234042553e-07, + "loss": 0.1295, + "num_tokens": 259307966.0, + "reward": 0.9095982313156128, + "reward_std": 0.4723241329193115, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316954612732, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4654017984867096, + "rewards/tag_count_reward/std": 0.3334912359714508, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1299.2567138671875, + "completions/mean_terminated_length": 875.1433715820312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.07884502690320175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11692593864040696, + "kl": 0.0061187744140625, + "learning_rate": 7.851063829787234e-07, + "loss": 0.1285, + "num_tokens": 259960385.0, + "reward": 0.8816964626312256, + "reward_std": 0.4885033965110779, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4419642984867096, + "rewards/tag_count_reward/std": 0.32552167773246765, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1430.212158203125, + "completions/mean_terminated_length": 909.0328979492188, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.07905812157050769, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11221244933418362, + "kl": 0.00559234619140625, + "learning_rate": 7.872340425531915e-07, + "loss": 0.1137, + "num_tokens": 260668080.0, + "reward": 0.8727679252624512, + "reward_std": 0.4812285900115967, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4263392984867096, + "rewards/tag_count_reward/std": 0.33090004324913025, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1363.3951416015625, + "completions/mean_terminated_length": 971.84912109375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.07927121623781365, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11257000777919583, + "kl": 0.00624847412109375, + "learning_rate": 7.893617021276595e-07, + "loss": 0.1328, + "num_tokens": 261349329.0, + "reward": 0.9196429252624512, + "reward_std": 0.5100753903388977, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4486607015132904, + "rewards/tag_count_reward/std": 0.32579004764556885, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1178.2388916015625, + "completions/mean_terminated_length": 795.096435546875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.07948431090511959, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15866669420654644, + "kl": 0.0077972412109375, + "learning_rate": 7.914893617021276e-07, + "loss": 0.1657, + "num_tokens": 261946972.0, + "reward": 0.9241071939468384, + "reward_std": 0.42977941036224365, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4709821343421936, + "rewards/tag_count_reward/std": 0.3410661220550537, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1280.0535888671875, + "completions/mean_terminated_length": 877.7958984375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.07969740557242555, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11136107292107865, + "kl": 0.00638580322265625, + "learning_rate": 7.936170212765957e-07, + "loss": 0.1159, + "num_tokens": 262588852.0, + "reward": 0.902901828289032, + "reward_std": 0.47068747878074646, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4743303656578064, + "rewards/tag_count_reward/std": 0.32367366552352905, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1107.930908203125, + "completions/mean_terminated_length": 834.308349609375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.0799105002397315, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12522299187898583, + "kl": 0.0082550048828125, + "learning_rate": 7.957446808510638e-07, + "loss": 0.14, + "num_tokens": 263157573.0, + "reward": 1.0848214626312256, + "reward_std": 0.4378872215747833, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5111607313156128, + "rewards/tag_count_reward/std": 0.2956402003765106, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1278.575927734375, + "completions/mean_terminated_length": 846.94775390625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08012359490703745, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11331678560154371, + "kl": 0.0064697265625, + "learning_rate": 7.978723404255318e-07, + "loss": 0.1201, + "num_tokens": 263801735.0, + "reward": 1.0212054252624512, + "reward_std": 0.4924018383026123, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4609375, + "rewards/tag_count_reward/std": 0.34978869557380676, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1286.0513916015625, + "completions/mean_terminated_length": 928.809814453125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.08033668957434341, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11872886083967804, + "kl": 0.00699615478515625, + "learning_rate": 8e-07, + "loss": 0.1073, + "num_tokens": 264464190.0, + "reward": 0.9804688096046448, + "reward_std": 0.46269384026527405, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4737723171710968, + "rewards/tag_count_reward/std": 0.3298339545726776, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1228.247802734375, + "completions/mean_terminated_length": 777.2422485351562, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.08054978424164935, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11443456779144483, + "kl": 0.0071868896484375, + "learning_rate": 8.02127659574468e-07, + "loss": 0.07, + "num_tokens": 265082717.0, + "reward": 1.01953125, + "reward_std": 0.45115476846694946, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4771205484867096, + "rewards/tag_count_reward/std": 0.34418463706970215, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1256.899658203125, + "completions/mean_terminated_length": 846.5999755859375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.08076287890895531, + "frac_reward_zero_std": 0.0, + "grad_norm": 22.9217259742924, + "kl": 0.72265625, + "learning_rate": 8.042553191489362e-07, + "loss": 0.1353, + "num_tokens": 265718960.0, + "reward": 0.9330357313156128, + "reward_std": 0.44094958901405334, + "rewards/accuracy_reward/mean": 0.44907405972480774, + "rewards/accuracy_reward/std": 0.49797651171684265, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.3083477020263672, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1213.4107666015625, + "completions/mean_terminated_length": 872.2263793945312, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.08097597357626125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4703130841075945, + "kl": 0.0083770751953125, + "learning_rate": 8.063829787234043e-07, + "loss": 0.1441, + "num_tokens": 266324504.0, + "reward": 1.0172991752624512, + "reward_std": 0.49286893010139465, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5061383843421936, + "rewards/tag_count_reward/std": 0.32912537455558777, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1299.75, + "completions/mean_terminated_length": 815.5882568359375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.08118906824356721, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12186164788933354, + "kl": 0.0063018798828125, + "learning_rate": 8.085106382978723e-07, + "loss": 0.144, + "num_tokens": 266981352.0, + "reward": 0.723214328289032, + "reward_std": 0.4090487062931061, + "rewards/accuracy_reward/mean": 0.3013392984867096, + "rewards/accuracy_reward/std": 0.4593527019023895, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.421875, + "rewards/tag_count_reward/std": 0.3311414122581482, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1177.821533203125, + "completions/mean_terminated_length": 794.4951782226562, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.08140216291087315, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13122956227799018, + "kl": 0.007293701171875, + "learning_rate": 8.106382978723404e-07, + "loss": 0.1642, + "num_tokens": 267578520.0, + "reward": 0.9447544813156128, + "reward_std": 0.4290495216846466, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5005580186843872, + "rewards/tag_count_reward/std": 0.3338218629360199, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1098.6920166015625, + "completions/mean_terminated_length": 839.789794921875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.08161525757817911, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12079459334840417, + "kl": 0.00794219970703125, + "learning_rate": 8.127659574468084e-07, + "loss": 0.0857, + "num_tokens": 268133614.0, + "reward": 0.9553571939468384, + "reward_std": 0.42930319905281067, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5089285969734192, + "rewards/tag_count_reward/std": 0.31935787200927734, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1216.0023193359375, + "completions/mean_terminated_length": 767.1237182617188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08182835224548506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12150887080201064, + "kl": 0.00646209716796875, + "learning_rate": 8.148936170212766e-07, + "loss": 0.1145, + "num_tokens": 268741199.0, + "reward": 0.8900669813156128, + "reward_std": 0.431190550327301, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4681919515132904, + "rewards/tag_count_reward/std": 0.32549819350242615, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1303.5804443359375, + "completions/mean_terminated_length": 877.8245849609375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.08204144691279101, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12152986193302466, + "kl": 0.00719451904296875, + "learning_rate": 8.170212765957446e-07, + "loss": 0.1204, + "num_tokens": 269390627.0, + "reward": 0.9084821939468384, + "reward_std": 0.45585721731185913, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4598214328289032, + "rewards/tag_count_reward/std": 0.3303336501121521, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1247.3929443359375, + "completions/mean_terminated_length": 912.9620361328125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.08225454158009696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12133284329664117, + "kl": 0.00803375244140625, + "learning_rate": 8.191489361702127e-07, + "loss": 0.1305, + "num_tokens": 270012099.0, + "reward": 1.0234375, + "reward_std": 0.5021721720695496, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5167410969734192, + "rewards/tag_count_reward/std": 0.31060466170310974, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1308.27685546875, + "completions/mean_terminated_length": 935.932861328125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.08246763624740291, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14095168509221923, + "kl": 0.0076904296875, + "learning_rate": 8.212765957446808e-07, + "loss": 0.0976, + "num_tokens": 270665391.0, + "reward": 0.9006696939468384, + "reward_std": 0.41965097188949585, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4765625, + "rewards/tag_count_reward/std": 0.34516119956970215, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 999.3616333007812, + "completions/mean_terminated_length": 788.5093994140625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08268073091470886, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1397852214011627, + "kl": 0.0105133056640625, + "learning_rate": 8.234042553191489e-07, + "loss": 0.1621, + "num_tokens": 271181553.0, + "reward": 1.2020089626312256, + "reward_std": 0.47391000390052795, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5904017686843872, + "rewards/tag_count_reward/std": 0.3059394359588623, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1274.9085693359375, + "completions/mean_terminated_length": 889.655517578125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.08289382558201482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14044456729494353, + "kl": 0.00778961181640625, + "learning_rate": 8.255319148936169e-07, + "loss": 0.1385, + "num_tokens": 271826760.0, + "reward": 0.8967634439468384, + "reward_std": 0.47555842995643616, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.51953125, + "rewards/tag_count_reward/std": 0.33575716614723206, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1109.7254638671875, + "completions/mean_terminated_length": 800.6795043945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.08310692024932076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13310501145274745, + "kl": 0.00927734375, + "learning_rate": 8.27659574468085e-07, + "loss": 0.1175, + "num_tokens": 272390493.0, + "reward": 1.1428571939468384, + "reward_std": 0.4624865651130676, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5558035969734192, + "rewards/tag_count_reward/std": 0.3385384678840637, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1143.279052734375, + "completions/mean_terminated_length": 800.876953125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.08332001491662672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12331078161692945, + "kl": 0.008148193359375, + "learning_rate": 8.297872340425532e-07, + "loss": 0.1217, + "num_tokens": 272974794.0, + "reward": 1.0106027126312256, + "reward_std": 0.45167678594589233, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5329241156578064, + "rewards/tag_count_reward/std": 0.33261173963546753, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1165.609375, + "completions/mean_terminated_length": 756.1339721679688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08353310958393266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1276473587656498, + "kl": 0.0080413818359375, + "learning_rate": 8.319148936170213e-07, + "loss": 0.1087, + "num_tokens": 273562811.0, + "reward": 0.9268973469734192, + "reward_std": 0.4446876645088196, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5139508843421936, + "rewards/tag_count_reward/std": 0.33810997009277344, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1237.982177734375, + "completions/mean_terminated_length": 809.474365234375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08374620425123862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3691404829409415, + "kl": 0.008453369140625, + "learning_rate": 8.340425531914893e-07, + "loss": 0.1421, + "num_tokens": 274184147.0, + "reward": 0.9921875596046448, + "reward_std": 0.41376620531082153, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5301339030265808, + "rewards/tag_count_reward/std": 0.32886216044425964, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1371.4285888671875, + "completions/mean_terminated_length": 904.2113037109375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08395929891854456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10926351966410949, + "kl": 0.0069122314453125, + "learning_rate": 8.361702127659575e-07, + "loss": 0.1233, + "num_tokens": 274869747.0, + "reward": 0.9408482313156128, + "reward_std": 0.5249491930007935, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4698660671710968, + "rewards/tag_count_reward/std": 0.34787043929100037, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1208.2054443359375, + "completions/mean_terminated_length": 793.9066772460938, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.08417239358585052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1314812214291103, + "kl": 0.00891876220703125, + "learning_rate": 8.382978723404255e-07, + "loss": 0.1259, + "num_tokens": 275483007.0, + "reward": 0.9838169813156128, + "reward_std": 0.45798468589782715, + "rewards/accuracy_reward/mean": 0.4513888955116272, + "rewards/accuracy_reward/std": 0.49820831418037415, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5485491156578064, + "rewards/tag_count_reward/std": 0.3431383967399597, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1063.75, + "completions/mean_terminated_length": 754.9091186523438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08438548825315646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14201382376296273, + "kl": 0.009429931640625, + "learning_rate": 8.404255319148936e-07, + "loss": 0.0703, + "num_tokens": 276025615.0, + "reward": 1.0044643878936768, + "reward_std": 0.45521751046180725, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5446428656578064, + "rewards/tag_count_reward/std": 0.3454955816268921, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1212.52685546875, + "completions/mean_terminated_length": 783.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08459858292046242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1275173509121519, + "kl": 0.0084228515625, + "learning_rate": 8.425531914893617e-07, + "loss": 0.1178, + "num_tokens": 276643339.0, + "reward": 0.8956473469734192, + "reward_std": 0.4494919180870056, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5139508843421936, + "rewards/tag_count_reward/std": 0.3558407127857208, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1137.7879638671875, + "completions/mean_terminated_length": 753.4761962890625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.08481167758776836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11779748853008372, + "kl": 0.009307861328125, + "learning_rate": 8.446808510638298e-07, + "loss": 0.157, + "num_tokens": 277214300.0, + "reward": 0.9871652126312256, + "reward_std": 0.43661531805992126, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.55859375, + "rewards/tag_count_reward/std": 0.3403328061103821, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 943.2567138671875, + "completions/mean_terminated_length": 742.1293334960938, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.08502477225507432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1400392652122895, + "kl": 0.011993408203125, + "learning_rate": 8.468085106382978e-07, + "loss": 0.1528, + "num_tokens": 277706623.0, + "reward": 1.1149554252624512, + "reward_std": 0.4290737509727478, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6350446343421936, + "rewards/tag_count_reward/std": 0.2985040843486786, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1140.138427734375, + "completions/mean_terminated_length": 735.9935302734375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08523786692238026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13630200511551344, + "kl": 0.00958251953125, + "learning_rate": 8.489361702127658e-07, + "loss": 0.1599, + "num_tokens": 278286909.0, + "reward": 1.0418527126312256, + "reward_std": 0.444623738527298, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5530133843421936, + "rewards/tag_count_reward/std": 0.3333728015422821, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1225.587158203125, + "completions/mean_terminated_length": 843.9444580078125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.08545096158968622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12969004505965936, + "kl": 0.0097198486328125, + "learning_rate": 8.51063829787234e-07, + "loss": 0.2023, + "num_tokens": 278903140.0, + "reward": 0.914620578289032, + "reward_std": 0.5368692874908447, + "rewards/accuracy_reward/mean": 0.3616071343421936, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5530133843421936, + "rewards/tag_count_reward/std": 0.3329531252384186, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1154.821533203125, + "completions/mean_terminated_length": 793.6300659179688, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.08566405625699217, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14148337863327637, + "kl": 0.010711669921875, + "learning_rate": 8.53191489361702e-07, + "loss": 0.2116, + "num_tokens": 279488196.0, + "reward": 1.015625, + "reward_std": 0.5095352530479431, + "rewards/accuracy_reward/mean": 0.44675925374031067, + "rewards/accuracy_reward/std": 0.4977337718009949, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5848214030265808, + "rewards/tag_count_reward/std": 0.3173815906047821, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1362.4910888671875, + "completions/mean_terminated_length": 862.2548217773438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.08587715092429812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1869206375790302, + "kl": 0.0090484619140625, + "learning_rate": 8.553191489361702e-07, + "loss": 0.1735, + "num_tokens": 280171312.0, + "reward": 0.9347098469734192, + "reward_std": 0.4820769131183624, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5329241156578064, + "rewards/tag_count_reward/std": 0.349021852016449, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1171.71875, + "completions/mean_terminated_length": 797.7643432617188, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.08609024559160407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8501290762371464, + "kl": 0.0127410888671875, + "learning_rate": 8.574468085106383e-07, + "loss": 0.1794, + "num_tokens": 280765282.0, + "reward": 1.0541294813156128, + "reward_std": 0.48049402236938477, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5786830186843872, + "rewards/tag_count_reward/std": 0.3440576493740082, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1261.055908203125, + "completions/mean_terminated_length": 895.87255859375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.08630334025891002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11630979067192901, + "kl": 0.0093841552734375, + "learning_rate": 8.595744680851064e-07, + "loss": 0.1694, + "num_tokens": 281403435.0, + "reward": 0.9893973469734192, + "reward_std": 0.45581328868865967, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5719866156578064, + "rewards/tag_count_reward/std": 0.3414534330368042, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1240.622802734375, + "completions/mean_terminated_length": 677.9053344726562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08651643492621597, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19638996721638072, + "kl": 0.0095977783203125, + "learning_rate": 8.617021276595744e-07, + "loss": 0.1741, + "num_tokens": 282027266.0, + "reward": 0.8750000596046448, + "reward_std": 0.48078683018684387, + "rewards/accuracy_reward/mean": 0.3482142984867096, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5267857313156128, + "rewards/tag_count_reward/std": 0.3469379246234894, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1002.6875610351562, + "completions/mean_terminated_length": 713.8119506835938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08672952959352193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13872811441735106, + "kl": 0.0122833251953125, + "learning_rate": 8.638297872340426e-07, + "loss": 0.1853, + "num_tokens": 282543286.0, + "reward": 1.1763393878936768, + "reward_std": 0.4613015651702881, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.6383928656578064, + "rewards/tag_count_reward/std": 0.31612035632133484, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1293.90185546875, + "completions/mean_terminated_length": 898.89794921875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.08694262426082787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11885463751958361, + "kl": 0.0086822509765625, + "learning_rate": 8.659574468085106e-07, + "loss": 0.139, + "num_tokens": 283200762.0, + "reward": 0.8621652126312256, + "reward_std": 0.4854860305786133, + "rewards/accuracy_reward/mean": 0.3147321343421936, + "rewards/accuracy_reward/std": 0.4649282693862915, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5474330186843872, + "rewards/tag_count_reward/std": 0.3325366675853729, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1270.34375, + "completions/mean_terminated_length": 821.274658203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.08715571892813383, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11533406625901478, + "kl": 0.0095672607421875, + "learning_rate": 8.680851063829787e-07, + "loss": 0.1466, + "num_tokens": 283840100.0, + "reward": 1.0100446939468384, + "reward_std": 0.48490050435066223, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5591517686843872, + "rewards/tag_count_reward/std": 0.3433043658733368, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1072.841552734375, + "completions/mean_terminated_length": 747.7886962890625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08736881359543977, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1545740167944373, + "kl": 0.01397705078125, + "learning_rate": 8.702127659574467e-07, + "loss": 0.075, + "num_tokens": 284386573.0, + "reward": 1.1568081378936768, + "reward_std": 0.3985455632209778, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6077008843421936, + "rewards/tag_count_reward/std": 0.32978853583335876, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1160.9263916015625, + "completions/mean_terminated_length": 790.3765869140625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.08758190826274573, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12193730660863826, + "kl": 0.0103912353515625, + "learning_rate": 8.723404255319149e-07, + "loss": 0.1217, + "num_tokens": 284977004.0, + "reward": 0.9720982313156128, + "reward_std": 0.5002922415733337, + "rewards/accuracy_reward/mean": 0.43981480598449707, + "rewards/accuracy_reward/std": 0.496940016746521, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5479910969734192, + "rewards/tag_count_reward/std": 0.34260547161102295, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1172.357177734375, + "completions/mean_terminated_length": 744.7175903320312, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.08779500293005167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12197006773415212, + "kl": 0.009246826171875, + "learning_rate": 8.744680851063829e-07, + "loss": 0.1893, + "num_tokens": 285570876.0, + "reward": 0.906808078289032, + "reward_std": 0.45161154866218567, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5474330186843872, + "rewards/tag_count_reward/std": 0.32830509543418884, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1158.34375, + "completions/mean_terminated_length": 790.6940307617188, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.08800809759735763, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12373033503104719, + "kl": 0.009521484375, + "learning_rate": 8.76595744680851e-07, + "loss": 0.132, + "num_tokens": 286169446.0, + "reward": 0.9871652126312256, + "reward_std": 0.4302607476711273, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5943080186843872, + "rewards/tag_count_reward/std": 0.3309297561645508, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1150.841552734375, + "completions/mean_terminated_length": 759.7724609375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.08822119226466357, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12886624788073314, + "kl": 0.009368896484375, + "learning_rate": 8.787234042553191e-07, + "loss": 0.2051, + "num_tokens": 286756927.0, + "reward": 1.01171875, + "reward_std": 0.45868971943855286, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.60546875, + "rewards/tag_count_reward/std": 0.3219386637210846, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1145.404052734375, + "completions/mean_terminated_length": 826.3594970703125, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.08843428693196953, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47207680286426645, + "kl": 0.0138397216796875, + "learning_rate": 8.808510638297872e-07, + "loss": 0.1608, + "num_tokens": 287335924.0, + "reward": 1.1696429252624512, + "reward_std": 0.46191391348838806, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6116071343421936, + "rewards/tag_count_reward/std": 0.32527613639831543, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1378.9398193359375, + "completions/mean_terminated_length": 958.0399780273438, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.08864738159927547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10072030654047953, + "kl": 0.0084075927734375, + "learning_rate": 8.829787234042553e-07, + "loss": 0.095, + "num_tokens": 288023257.0, + "reward": 0.9553571939468384, + "reward_std": 0.457049161195755, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5602678656578064, + "rewards/tag_count_reward/std": 0.33023539185523987, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1267.07373046875, + "completions/mean_terminated_length": 862.0508422851562, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.08886047626658143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14592557482538301, + "kl": 0.01007080078125, + "learning_rate": 8.851063829787234e-07, + "loss": 0.1525, + "num_tokens": 288667178.0, + "reward": 0.9698660969734192, + "reward_std": 0.48869234323501587, + "rewards/accuracy_reward/mean": 0.43518519401550293, + "rewards/accuracy_reward/std": 0.4963560700416565, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5502232313156128, + "rewards/tag_count_reward/std": 0.34269291162490845, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1056.7567138671875, + "completions/mean_terminated_length": 807.5614013671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08907357093388737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1344909011669105, + "kl": 0.0123443603515625, + "learning_rate": 8.872340425531915e-07, + "loss": 0.1524, + "num_tokens": 289206541.0, + "reward": 1.1969866752624512, + "reward_std": 0.4307486116886139, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6590401530265808, + "rewards/tag_count_reward/std": 0.30918413400650024, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1324.7076416015625, + "completions/mean_terminated_length": 907.0316772460938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.08928666560119333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15712268060789628, + "kl": 0.0111846923828125, + "learning_rate": 8.893617021276595e-07, + "loss": 0.1069, + "num_tokens": 289867674.0, + "reward": 0.9095982313156128, + "reward_std": 0.5182147026062012, + "rewards/accuracy_reward/mean": 0.3504464328289032, + "rewards/accuracy_reward/std": 0.47764310240745544, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5591517686843872, + "rewards/tag_count_reward/std": 0.35135555267333984, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1142.4710693359375, + "completions/mean_terminated_length": 743.5723266601562, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.08949976026849928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12941238973364502, + "kl": 0.011871337890625, + "learning_rate": 8.914893617021276e-07, + "loss": 0.1273, + "num_tokens": 290454685.0, + "reward": 1.020647406578064, + "reward_std": 0.44607022404670715, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6032366156578064, + "rewards/tag_count_reward/std": 0.3295234441757202, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1242.9285888671875, + "completions/mean_terminated_length": 938.239990234375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.08971285493580523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12109964315197068, + "kl": 0.01214599609375, + "learning_rate": 8.936170212765957e-07, + "loss": 0.1435, + "num_tokens": 291081229.0, + "reward": 1.1824777126312256, + "reward_std": 0.49143338203430176, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6311383843421936, + "rewards/tag_count_reward/std": 0.3454987406730652, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1093.810302734375, + "completions/mean_terminated_length": 752.6151123046875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.08992594960311118, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4082328573676096, + "kl": 0.0132904052734375, + "learning_rate": 8.957446808510638e-07, + "loss": 0.1402, + "num_tokens": 291639992.0, + "reward": 1.223772406578064, + "reward_std": 0.4036027491092682, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.32790935039520264, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1170.25, + "completions/mean_terminated_length": 807.5205078125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.09013904427041713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13500643030383605, + "kl": 0.0111236572265625, + "learning_rate": 8.978723404255318e-07, + "loss": 0.179, + "num_tokens": 292243736.0, + "reward": 1.0535714626312256, + "reward_std": 0.46681398153305054, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5825892686843872, + "rewards/tag_count_reward/std": 0.33509889245033264, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1269.2545166015625, + "completions/mean_terminated_length": 900.375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.09035213893772308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12209733702831041, + "kl": 0.010467529296875, + "learning_rate": 9e-07, + "loss": 0.1315, + "num_tokens": 292878394.0, + "reward": 1.0150669813156128, + "reward_std": 0.45642924308776855, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5641741156578064, + "rewards/tag_count_reward/std": 0.3442317843437195, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1259.69873046875, + "completions/mean_terminated_length": 838.5513916015625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.09056523360502904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1176347988210879, + "kl": 0.01043701171875, + "learning_rate": 9.02127659574468e-07, + "loss": 0.1184, + "num_tokens": 293511731.0, + "reward": 0.9235491752624512, + "reward_std": 0.42911988496780396, + "rewards/accuracy_reward/mean": 0.34490740299224854, + "rewards/accuracy_reward/std": 0.4758892059326172, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5909598469734192, + "rewards/tag_count_reward/std": 0.33312928676605225, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1104.65185546875, + "completions/mean_terminated_length": 840.5142822265625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.09077832827233498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4159101694611062, + "kl": 0.0137939453125, + "learning_rate": 9.042553191489361e-07, + "loss": 0.1367, + "num_tokens": 294079319.0, + "reward": 1.2299107313156128, + "reward_std": 0.44114354252815247, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6428571343421936, + "rewards/tag_count_reward/std": 0.3259202539920807, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1154.375, + "completions/mean_terminated_length": 796.9249877929688, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.09099142293964094, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5289366488765045, + "kl": 0.014892578125, + "learning_rate": 9.063829787234041e-07, + "loss": 0.0995, + "num_tokens": 294667391.0, + "reward": 1.0920759439468384, + "reward_std": 0.42149776220321655, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6032366156578064, + "rewards/tag_count_reward/std": 0.32136034965515137, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1220.962158203125, + "completions/mean_terminated_length": 783.4505004882812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.09120451760694688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12314840787364822, + "kl": 0.0100250244140625, + "learning_rate": 9.085106382978724e-07, + "loss": 0.1711, + "num_tokens": 295292734.0, + "reward": 1.0703125, + "reward_std": 0.4780718684196472, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5881696343421936, + "rewards/tag_count_reward/std": 0.3520229160785675, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1210.72998046875, + "completions/mean_terminated_length": 805.9569702148438, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.09141761227425284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11447318591624865, + "kl": 0.0113983154296875, + "learning_rate": 9.106382978723404e-07, + "loss": 0.1264, + "num_tokens": 295900133.0, + "reward": 1.1227679252624512, + "reward_std": 0.4260932505130768, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6116071343421936, + "rewards/tag_count_reward/std": 0.3370972275733948, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1145.3638916015625, + "completions/mean_terminated_length": 840.8925170898438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.09163070694155878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12672488335703755, + "kl": 0.0112457275390625, + "learning_rate": 9.127659574468085e-07, + "loss": 0.118, + "num_tokens": 296480952.0, + "reward": 1.1060268878936768, + "reward_std": 0.4731019139289856, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6149553656578064, + "rewards/tag_count_reward/std": 0.3241053819656372, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1232.7545166015625, + "completions/mean_terminated_length": 854.4379272460938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.09184380160886474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9810241276347809, + "kl": 0.0146026611328125, + "learning_rate": 9.148936170212766e-07, + "loss": 0.1508, + "num_tokens": 297098074.0, + "reward": 1.1010044813156128, + "reward_std": 0.4596308171749115, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6277901530265808, + "rewards/tag_count_reward/std": 0.3361065089702606, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1212.122802734375, + "completions/mean_terminated_length": 791.3792114257812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.09205689627617068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1291479991254237, + "kl": 0.0109710693359375, + "learning_rate": 9.170212765957447e-07, + "loss": 0.1452, + "num_tokens": 297712801.0, + "reward": 1.0580357313156128, + "reward_std": 0.44149741530418396, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5758928656578064, + "rewards/tag_count_reward/std": 0.35051780939102173, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1124.62060546875, + "completions/mean_terminated_length": 650.4526977539062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.09226999094347664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14046931841678065, + "kl": 0.0123443603515625, + "learning_rate": 9.191489361702127e-07, + "loss": 0.1474, + "num_tokens": 298284343.0, + "reward": 1.01953125, + "reward_std": 0.4352644979953766, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6177455186843872, + "rewards/tag_count_reward/std": 0.33016306161880493, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1054.453125, + "completions/mean_terminated_length": 801.1961059570312, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.09248308561078258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13874348920062532, + "kl": 0.0171661376953125, + "learning_rate": 9.212765957446809e-07, + "loss": 0.1568, + "num_tokens": 298826514.0, + "reward": 1.2566964626312256, + "reward_std": 0.3902831971645355, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.3065285086631775, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1261.3660888671875, + "completions/mean_terminated_length": 836.9622192382812, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.09269618027808854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1164956345337005, + "kl": 0.0110015869140625, + "learning_rate": 9.234042553191489e-07, + "loss": 0.1307, + "num_tokens": 299457750.0, + "reward": 0.9972098469734192, + "reward_std": 0.46054917573928833, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.58203125, + "rewards/tag_count_reward/std": 0.348928838968277, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1256.77685546875, + "completions/mean_terminated_length": 842.3265380859375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.09290927494539448, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.228366832954742, + "kl": 0.0327606201171875, + "learning_rate": 9.255319148936169e-07, + "loss": 0.1244, + "num_tokens": 300097730.0, + "reward": 1.0602679252624512, + "reward_std": 0.5038176774978638, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6026785969734192, + "rewards/tag_count_reward/std": 0.35085955262184143, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1191.743408203125, + "completions/mean_terminated_length": 692.5123901367188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.09312236961270044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11963888583713599, + "kl": 0.009979248046875, + "learning_rate": 9.27659574468085e-07, + "loss": 0.1469, + "num_tokens": 300704015.0, + "reward": 0.9648438096046448, + "reward_std": 0.3931638300418854, + "rewards/accuracy_reward/mean": 0.3683035671710968, + "rewards/accuracy_reward/std": 0.4828835725784302, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5965401530265808, + "rewards/tag_count_reward/std": 0.33155161142349243, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1266.8013916015625, + "completions/mean_terminated_length": 815.6865844726562, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.09333546428000639, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11927082869488527, + "kl": 0.012115478515625, + "learning_rate": 9.297872340425531e-07, + "loss": 0.1345, + "num_tokens": 301341686.0, + "reward": 1.0904018878936768, + "reward_std": 0.4178585410118103, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6082589030265808, + "rewards/tag_count_reward/std": 0.3570931553840637, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1063.294677734375, + "completions/mean_terminated_length": 742.8284301757812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.09354855894731234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14369039189530988, + "kl": 0.01312255859375, + "learning_rate": 9.319148936170212e-07, + "loss": 0.1522, + "num_tokens": 301888826.0, + "reward": 1.1127232313156128, + "reward_std": 0.38257989287376404, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6551339030265808, + "rewards/tag_count_reward/std": 0.3154067397117615, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1170.6585693359375, + "completions/mean_terminated_length": 811.996826171875, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.0937616536146183, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30689896878613543, + "kl": 0.0345458984375, + "learning_rate": 9.340425531914892e-07, + "loss": 0.1494, + "num_tokens": 302486081.0, + "reward": 1.1177456378936768, + "reward_std": 0.440822958946228, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6735491156578064, + "rewards/tag_count_reward/std": 0.3244994580745697, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1320.1295166015625, + "completions/mean_terminated_length": 862.2327270507812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.09397474828192424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12317266904399936, + "kl": 0.01263427734375, + "learning_rate": 9.361702127659575e-07, + "loss": 0.1497, + "num_tokens": 303149531.0, + "reward": 0.9748884439468384, + "reward_std": 0.5206019282341003, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5909598469734192, + "rewards/tag_count_reward/std": 0.3462999761104584, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1245.7545166015625, + "completions/mean_terminated_length": 910.6392822265625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0941878429492302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1287317155890944, + "kl": 0.01409912109375, + "learning_rate": 9.382978723404255e-07, + "loss": 0.0887, + "num_tokens": 303771645.0, + "reward": 1.140625, + "reward_std": 0.49231529235839844, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6741071343421936, + "rewards/tag_count_reward/std": 0.3205128610134125, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1065.2210693359375, + "completions/mean_terminated_length": 790.0428466796875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.09440093761653615, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12968192211795326, + "kl": 0.013031005859375, + "learning_rate": 9.404255319148936e-07, + "loss": 0.0749, + "num_tokens": 304320176.0, + "reward": 1.1891741752624512, + "reward_std": 0.41448912024497986, + "rewards/accuracy_reward/mean": 0.5069444179534912, + "rewards/accuracy_reward/std": 0.5005314350128174, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7003348469734192, + "rewards/tag_count_reward/std": 0.31977149844169617, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1014.4397583007812, + "completions/mean_terminated_length": 765.3545532226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0946140322838421, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14759827614171392, + "kl": 0.0149688720703125, + "learning_rate": 9.425531914893617e-07, + "loss": 0.1356, + "num_tokens": 304850117.0, + "reward": 1.2293527126312256, + "reward_std": 0.38068321347236633, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6981026530265808, + "rewards/tag_count_reward/std": 0.31941601634025574, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1122.930908203125, + "completions/mean_terminated_length": 807.1886596679688, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.09482712695114805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7748710603056193, + "kl": 0.01385498046875, + "learning_rate": 9.446808510638298e-07, + "loss": 0.1483, + "num_tokens": 305424550.0, + "reward": 1.1696429252624512, + "reward_std": 0.48740342259407043, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6763392686843872, + "rewards/tag_count_reward/std": 0.32146963477134705, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1131.49560546875, + "completions/mean_terminated_length": 756.8238525390625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.095040221618454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13267983887980828, + "kl": 0.011871337890625, + "learning_rate": 9.468085106382978e-07, + "loss": 0.1591, + "num_tokens": 306001844.0, + "reward": 1.0390625, + "reward_std": 0.423543781042099, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6462053656578064, + "rewards/tag_count_reward/std": 0.34123262763023376, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1163.99560546875, + "completions/mean_terminated_length": 883.1941528320312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.09525331628575995, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12894255539075125, + "kl": 0.0131988525390625, + "learning_rate": 9.489361702127659e-07, + "loss": 0.1404, + "num_tokens": 306595266.0, + "reward": 1.2405134439468384, + "reward_std": 0.46087321639060974, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6802455186843872, + "rewards/tag_count_reward/std": 0.3229917287826538, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1079.9442138671875, + "completions/mean_terminated_length": 701.1397705078125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.0954664109530659, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13842151229941713, + "kl": 0.0139923095703125, + "learning_rate": 9.51063829787234e-07, + "loss": 0.1134, + "num_tokens": 307150633.0, + "reward": 1.1908482313156128, + "reward_std": 0.38598817586898804, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6662946343421936, + "rewards/tag_count_reward/std": 0.3194340765476227, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1297.8192138671875, + "completions/mean_terminated_length": 885.0899658203125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.09567950562037185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12584569530561895, + "kl": 0.0123138427734375, + "learning_rate": 9.531914893617021e-07, + "loss": 0.1527, + "num_tokens": 307801240.0, + "reward": 1.0100446939468384, + "reward_std": 0.4523391127586365, + "rewards/accuracy_reward/mean": 0.39120370149612427, + "rewards/accuracy_reward/std": 0.4885856807231903, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6328125, + "rewards/tag_count_reward/std": 0.34424853324890137, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1079.4732666015625, + "completions/mean_terminated_length": 793.9537353515625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0958926002876778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14007582337384927, + "kl": 0.0145263671875, + "learning_rate": 9.553191489361702e-07, + "loss": 0.1058, + "num_tokens": 308358556.0, + "reward": 1.1997768878936768, + "reward_std": 0.42876285314559937, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6752232313156128, + "rewards/tag_count_reward/std": 0.30605366826057434, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1064.977783203125, + "completions/mean_terminated_length": 803.9491577148438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.09610569495498375, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1415933337491164, + "kl": 0.01531982421875, + "learning_rate": 9.574468085106384e-07, + "loss": 0.1308, + "num_tokens": 308907538.0, + "reward": 1.30859375, + "reward_std": 0.4190255403518677, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035415053367615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7081473469734192, + "rewards/tag_count_reward/std": 0.31428611278533936, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1154.0201416015625, + "completions/mean_terminated_length": 852.4686889648438, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.09631878962228971, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11258313246598783, + "kl": 0.013702392578125, + "learning_rate": 9.595744680851063e-07, + "loss": 0.1297, + "num_tokens": 309486011.0, + "reward": 1.1305804252624512, + "reward_std": 0.47361451387405396, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6640625, + "rewards/tag_count_reward/std": 0.32834547758102417, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1150.5648193359375, + "completions/mean_terminated_length": 829.6636352539062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.09653188428959565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3226003479524171, + "kl": 0.0183868408203125, + "learning_rate": 9.617021276595744e-07, + "loss": 0.1645, + "num_tokens": 310072664.0, + "reward": 1.0920759439468384, + "reward_std": 0.42832887172698975, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6612723469734192, + "rewards/tag_count_reward/std": 0.33540377020835876, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1138.0357666015625, + "completions/mean_terminated_length": 801.3211059570312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.09674497895690161, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1297357108723474, + "kl": 0.012603759765625, + "learning_rate": 9.638297872340426e-07, + "loss": 0.1127, + "num_tokens": 310656568.0, + "reward": 1.0809152126312256, + "reward_std": 0.45041897892951965, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6188616156578064, + "rewards/tag_count_reward/std": 0.3434693217277527, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1274.9129638671875, + "completions/mean_terminated_length": 901.1688842773438, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.09695807362420755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11556313160924528, + "kl": 0.010711669921875, + "learning_rate": 9.659574468085105e-07, + "loss": 0.176, + "num_tokens": 311300625.0, + "reward": 0.9012277126312256, + "reward_std": 0.435077428817749, + "rewards/accuracy_reward/mean": 0.2946428656578064, + "rewards/accuracy_reward/std": 0.45639166235923767, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6065848469734192, + "rewards/tag_count_reward/std": 0.3347778618335724, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1185.453125, + "completions/mean_terminated_length": 859.0123291015625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.09717116829151351, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7537882718330753, + "kl": 0.0487060546875, + "learning_rate": 9.680851063829786e-07, + "loss": 0.1028, + "num_tokens": 311899996.0, + "reward": 1.090959906578064, + "reward_std": 0.41947564482688904, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6646205186843872, + "rewards/tag_count_reward/std": 0.3200836777687073, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1137.2879638671875, + "completions/mean_terminated_length": 776.9750366210938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.09738426295881945, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1380195687610888, + "kl": 0.0118865966796875, + "learning_rate": 9.702127659574467e-07, + "loss": 0.0913, + "num_tokens": 312474557.0, + "reward": 1.079241156578064, + "reward_std": 0.4124201536178589, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6439732313156128, + "rewards/tag_count_reward/std": 0.31935593485832214, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1189.544677734375, + "completions/mean_terminated_length": 807.3935546875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.09759735762612541, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12936722184663546, + "kl": 0.012237548828125, + "learning_rate": 9.723404255319149e-07, + "loss": 0.1174, + "num_tokens": 313077569.0, + "reward": 1.14453125, + "reward_std": 0.4015459716320038, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6311383843421936, + "rewards/tag_count_reward/std": 0.3339751660823822, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1251.747802734375, + "completions/mean_terminated_length": 874.57568359375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.09781045229343135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12649023922652772, + "kl": 0.011260986328125, + "learning_rate": 9.74468085106383e-07, + "loss": 0.1283, + "num_tokens": 313714016.0, + "reward": 1.046875, + "reward_std": 0.4707343876361847, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.328119158744812, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1210.9442138671875, + "completions/mean_terminated_length": 883.400634765625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.09802354696073731, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12754187424830113, + "kl": 0.0118865966796875, + "learning_rate": 9.765957446808511e-07, + "loss": 0.1364, + "num_tokens": 314327303.0, + "reward": 1.0770089626312256, + "reward_std": 0.48893120884895325, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.494759202003479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6529017686843872, + "rewards/tag_count_reward/std": 0.3133895695209503, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1040.7366943359375, + "completions/mean_terminated_length": 783.9832153320312, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.09823664162804326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14569327225511766, + "kl": 0.0135345458984375, + "learning_rate": 9.78723404255319e-07, + "loss": 0.1423, + "num_tokens": 314860753.0, + "reward": 1.1852679252624512, + "reward_std": 0.4032217562198639, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6808035969734192, + "rewards/tag_count_reward/std": 0.322462260723114, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1013.88623046875, + "completions/mean_terminated_length": 750.2885131835938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.09844973629534921, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15114678144959096, + "kl": 0.0137481689453125, + "learning_rate": 9.808510638297872e-07, + "loss": 0.191, + "num_tokens": 315383966.0, + "reward": 1.1852679252624512, + "reward_std": 0.45902127027511597, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6852678656578064, + "rewards/tag_count_reward/std": 0.3373563885688782, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1179.6116943359375, + "completions/mean_terminated_length": 869.096923828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.09866283096265516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1315273401210009, + "kl": 0.012359619140625, + "learning_rate": 9.829787234042553e-07, + "loss": 0.1383, + "num_tokens": 315984944.0, + "reward": 1.078125, + "reward_std": 0.4403482973575592, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6316964030265808, + "rewards/tag_count_reward/std": 0.31456056237220764, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1047.1473388671875, + "completions/mean_terminated_length": 792.0280151367188, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.09887592562996111, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3075517367740281, + "kl": 0.0149078369140625, + "learning_rate": 9.851063829787235e-07, + "loss": 0.1517, + "num_tokens": 316521746.0, + "reward": 1.1205357313156128, + "reward_std": 0.4365750253200531, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6897321343421936, + "rewards/tag_count_reward/std": 0.31328800320625305, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1112.5804443359375, + "completions/mean_terminated_length": 800.7738037109375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.09908902029726706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12263570237272706, + "kl": 0.01287841796875, + "learning_rate": 9.872340425531914e-07, + "loss": 0.0933, + "num_tokens": 317084486.0, + "reward": 1.1238839626312256, + "reward_std": 0.40685784816741943, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.3129750192165375, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1125.5826416015625, + "completions/mean_terminated_length": 776.4830932617188, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.09930211496457302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26723573147817115, + "kl": 0.0135955810546875, + "learning_rate": 9.893617021276595e-07, + "loss": 0.1407, + "num_tokens": 317654155.0, + "reward": 1.203125, + "reward_std": 0.447807639837265, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6897321343421936, + "rewards/tag_count_reward/std": 0.32034924626350403, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1260.8125, + "completions/mean_terminated_length": 924.8789672851562, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.09951520963187896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11900629784858324, + "kl": 0.0116119384765625, + "learning_rate": 9.914893617021276e-07, + "loss": 0.1267, + "num_tokens": 318304967.0, + "reward": 0.9441964626312256, + "reward_std": 0.39277732372283936, + "rewards/accuracy_reward/mean": 0.3236607015132904, + "rewards/accuracy_reward/std": 0.46839529275894165, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6205357313156128, + "rewards/tag_count_reward/std": 0.31769606471061707, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1018.685302734375, + "completions/mean_terminated_length": 726.7020263671875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.09972830429918492, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.16530537406241555, + "kl": 0.015228271484375, + "learning_rate": 9.936170212765958e-07, + "loss": 0.1576, + "num_tokens": 318829402.0, + "reward": 1.1969866752624512, + "reward_std": 0.38769274950027466, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7036830186843872, + "rewards/tag_count_reward/std": 0.3141033351421356, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1207.7054443359375, + "completions/mean_terminated_length": 845.2779541015625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.09994139896649086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13483532352460165, + "kl": 0.01446533203125, + "learning_rate": 9.957446808510637e-07, + "loss": 0.1468, + "num_tokens": 319440150.0, + "reward": 1.0524554252624512, + "reward_std": 0.4234815537929535, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6216517686843872, + "rewards/tag_count_reward/std": 0.3365171253681183, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1105.555908203125, + "completions/mean_terminated_length": 736.7733154296875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.10015449363379682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1638360128156321, + "kl": 0.013336181640625, + "learning_rate": 9.978723404255318e-07, + "loss": 0.1397, + "num_tokens": 320007855.0, + "reward": 1.1138393878936768, + "reward_std": 0.39371177554130554, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6584821343421936, + "rewards/tag_count_reward/std": 0.3418850302696228, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1159.078125, + "completions/mean_terminated_length": 869.7840576171875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.10036758830110276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12421842783563647, + "kl": 0.01348876953125, + "learning_rate": 1e-06, + "loss": 0.1133, + "num_tokens": 320601650.0, + "reward": 1.215959906578064, + "reward_std": 0.42418786883354187, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6936383843421936, + "rewards/tag_count_reward/std": 0.3133472502231598, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1060.8951416015625, + "completions/mean_terminated_length": 871.875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.10058068296840872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3041596854087582, + "kl": 0.016754150390625, + "learning_rate": 9.999998754797222e-07, + "loss": 0.1243, + "num_tokens": 321145795.0, + "reward": 1.340959906578064, + "reward_std": 0.45829442143440247, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7806919813156128, + "rewards/tag_count_reward/std": 0.27228620648384094, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1109.046875, + "completions/mean_terminated_length": 749.6944580078125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.10079377763571466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13875230681146303, + "kl": 0.0123748779296875, + "learning_rate": 9.99999501918958e-07, + "loss": 0.0762, + "num_tokens": 321712344.0, + "reward": 1.1188616752624512, + "reward_std": 0.41439902782440186, + "rewards/accuracy_reward/mean": 0.47685185074806213, + "rewards/accuracy_reward/std": 0.5000429749488831, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6590401530265808, + "rewards/tag_count_reward/std": 0.31766021251678467, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1181.046875, + "completions/mean_terminated_length": 856.604248046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.10100687230302062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1323363674377381, + "kl": 0.01513671875, + "learning_rate": 9.999988793179141e-07, + "loss": 0.1139, + "num_tokens": 322309965.0, + "reward": 1.2170759439468384, + "reward_std": 0.47222140431404114, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6969866156578064, + "rewards/tag_count_reward/std": 0.32444560527801514, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1142.9866943359375, + "completions/mean_terminated_length": 855.5117797851562, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.10121996697032656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12387421477348093, + "kl": 0.013519287109375, + "learning_rate": 9.999980076769348e-07, + "loss": 0.1511, + "num_tokens": 322893591.0, + "reward": 1.2059152126312256, + "reward_std": 0.38405469059944153, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7059151530265808, + "rewards/tag_count_reward/std": 0.3287724554538727, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 996.1563110351562, + "completions/mean_terminated_length": 713.0821533203125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.10143306163763252, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15765461278053922, + "kl": 0.0155487060546875, + "learning_rate": 9.999968869965026e-07, + "loss": 0.1592, + "num_tokens": 323409021.0, + "reward": 1.20703125, + "reward_std": 0.3859425485134125, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7159598469734192, + "rewards/tag_count_reward/std": 0.31831979751586914, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1051.2232666015625, + "completions/mean_terminated_length": 817.8181762695312, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.10164615630493846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13197322469146067, + "kl": 0.0146484375, + "learning_rate": 9.99995517277238e-07, + "loss": 0.1098, + "num_tokens": 323945057.0, + "reward": 1.2494419813156128, + "reward_std": 0.38459935784339905, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7516741156578064, + "rewards/tag_count_reward/std": 0.28304529190063477, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1154.66748046875, + "completions/mean_terminated_length": 831.547119140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.10185925097224442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1326367157254861, + "kl": 0.0151214599609375, + "learning_rate": 9.999938985198985e-07, + "loss": 0.1155, + "num_tokens": 324530812.0, + "reward": 1.1584821939468384, + "reward_std": 0.4268447160720825, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6964285969734192, + "rewards/tag_count_reward/std": 0.3131684362888336, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1078.15625, + "completions/mean_terminated_length": 841.0833740234375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.10207234563955037, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13267499481677836, + "kl": 0.0180511474609375, + "learning_rate": 9.999920307253804e-07, + "loss": 0.0967, + "num_tokens": 325078066.0, + "reward": 1.3738839626312256, + "reward_std": 0.4221484959125519, + "rewards/accuracy_reward/mean": 0.6319444179534912, + "rewards/accuracy_reward/std": 0.48283571004867554, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7645089030265808, + "rewards/tag_count_reward/std": 0.31251198053359985, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1055.3660888671875, + "completions/mean_terminated_length": 812.7222290039062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.10228544030685632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13259897750583752, + "kl": 0.015777587890625, + "learning_rate": 9.999899138947174e-07, + "loss": 0.0998, + "num_tokens": 325620182.0, + "reward": 1.2059152126312256, + "reward_std": 0.4361419975757599, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7327008843421936, + "rewards/tag_count_reward/std": 0.298405259847641, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1241.21875, + "completions/mean_terminated_length": 900.5778198242188, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.10249853497416227, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11718697952367829, + "kl": 0.01312255859375, + "learning_rate": 9.999875480290809e-07, + "loss": 0.1043, + "num_tokens": 326246664.0, + "reward": 1.172991156578064, + "reward_std": 0.4758950173854828, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6886160969734192, + "rewards/tag_count_reward/std": 0.3223131597042084, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 999.1920166015625, + "completions/mean_terminated_length": 781.5148315429688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.10271162964146822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13993229328415177, + "kl": 0.016937255859375, + "learning_rate": 9.999849331297799e-07, + "loss": 0.1095, + "num_tokens": 326764654.0, + "reward": 1.3627232313156128, + "reward_std": 0.3963104486465454, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7533482313156128, + "rewards/tag_count_reward/std": 0.29958948493003845, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1074.4129638671875, + "completions/mean_terminated_length": 822.8118286132812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.10292472430877417, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41277423822743126, + "kl": 0.037567138671875, + "learning_rate": 9.99982069198262e-07, + "loss": 0.1098, + "num_tokens": 327313159.0, + "reward": 1.2533482313156128, + "reward_std": 0.4396021366119385, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7555803656578064, + "rewards/tag_count_reward/std": 0.30418795347213745, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1064.6629638671875, + "completions/mean_terminated_length": 732.9701538085938, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.10313781897608013, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16347744786689283, + "kl": 0.0197296142578125, + "learning_rate": 9.99978956236112e-07, + "loss": 0.1829, + "num_tokens": 327856560.0, + "reward": 1.1891741752624512, + "reward_std": 0.3950643837451935, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549686908722, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7137276530265808, + "rewards/tag_count_reward/std": 0.3189505934715271, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1045.34375, + "completions/mean_terminated_length": 764.5999755859375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.10335091364338607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4509870575701992, + "kl": 0.048583984375, + "learning_rate": 9.999755942450525e-07, + "loss": 0.1119, + "num_tokens": 328396330.0, + "reward": 1.278459906578064, + "reward_std": 0.41523391008377075, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7271205186843872, + "rewards/tag_count_reward/std": 0.34091922640800476, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1137.279052734375, + "completions/mean_terminated_length": 788.7315063476562, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.10356400831069203, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1267572031800838, + "kl": 0.0137481689453125, + "learning_rate": 9.999719832269443e-07, + "loss": 0.0779, + "num_tokens": 328971063.0, + "reward": 1.23828125, + "reward_std": 0.4070073068141937, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7092633843421936, + "rewards/tag_count_reward/std": 0.3270745277404785, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1166.1451416015625, + "completions/mean_terminated_length": 828.6450805664062, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.10377710297799797, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11671655775877127, + "kl": 0.01446533203125, + "learning_rate": 9.99968123183786e-07, + "loss": 0.1079, + "num_tokens": 329568664.0, + "reward": 1.1194196939468384, + "reward_std": 0.42910727858543396, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6908482313156128, + "rewards/tag_count_reward/std": 0.3424888253211975, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 987.9152221679688, + "completions/mean_terminated_length": 798.2158203125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.10399019764530393, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1431692286925934, + "kl": 0.0169525146484375, + "learning_rate": 9.999640141177135e-07, + "loss": 0.1129, + "num_tokens": 330083986.0, + "reward": 1.313616156578064, + "reward_std": 0.4264586567878723, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.2877199947834015, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1058.3795166015625, + "completions/mean_terminated_length": 762.9275512695312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.10420329231260987, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1453310242710074, + "kl": 0.0164642333984375, + "learning_rate": 9.999596560310011e-07, + "loss": 0.1413, + "num_tokens": 330627100.0, + "reward": 1.2639509439468384, + "reward_std": 0.4461327791213989, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7327008843421936, + "rewards/tag_count_reward/std": 0.30808794498443604, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1297.5335693359375, + "completions/mean_terminated_length": 880.607666015625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.10441638697991583, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11376023971570538, + "kl": 0.0116119384765625, + "learning_rate": 9.999550489260604e-07, + "loss": 0.1491, + "num_tokens": 331289371.0, + "reward": 1.047991156578064, + "reward_std": 0.45523765683174133, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6439732313156128, + "rewards/tag_count_reward/std": 0.33682864904403687, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1158.3348388671875, + "completions/mean_terminated_length": 851.0931396484375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.10462948164722177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13143303938856157, + "kl": 0.01300048828125, + "learning_rate": 9.999501928054414e-07, + "loss": 0.1293, + "num_tokens": 331879953.0, + "reward": 1.180803656578064, + "reward_std": 0.49052533507347107, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6964285969734192, + "rewards/tag_count_reward/std": 0.3219740688800812, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1104.97998046875, + "completions/mean_terminated_length": 715.277587890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.10484257631452773, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12751031398177987, + "kl": 0.0137939453125, + "learning_rate": 9.999450876718313e-07, + "loss": 0.1371, + "num_tokens": 332437768.0, + "reward": 1.2064732313156128, + "reward_std": 0.4385809898376465, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6930803656578064, + "rewards/tag_count_reward/std": 0.3603644073009491, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1027.2410888671875, + "completions/mean_terminated_length": 777.7222290039062, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.10505567098183367, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14254317945035394, + "kl": 0.0149688720703125, + "learning_rate": 9.999397335280558e-07, + "loss": 0.1304, + "num_tokens": 332967540.0, + "reward": 1.1121652126312256, + "reward_std": 0.3940357565879822, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7237723469734192, + "rewards/tag_count_reward/std": 0.29586759209632874, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1213.1875, + "completions/mean_terminated_length": 928.2515258789062, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.10526876564913963, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1218325779123523, + "kl": 0.0144805908203125, + "learning_rate": 9.999341303770773e-07, + "loss": 0.1287, + "num_tokens": 333573464.0, + "reward": 1.21484375, + "reward_std": 0.48583367466926575, + "rewards/accuracy_reward/mean": 0.5115740895271301, + "rewards/accuracy_reward/std": 0.5004456043243408, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7215401530265808, + "rewards/tag_count_reward/std": 0.32709363102912903, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1063.43310546875, + "completions/mean_terminated_length": 769.4898681640625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.10548186031644557, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14470465965765336, + "kl": 0.0158233642578125, + "learning_rate": 9.999282782219976e-07, + "loss": 0.1564, + "num_tokens": 334120170.0, + "reward": 1.223772406578064, + "reward_std": 0.4154900312423706, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7126116156578064, + "rewards/tag_count_reward/std": 0.3035444915294647, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1091.078125, + "completions/mean_terminated_length": 779.6538696289062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.10569495498375153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1272113650096219, + "kl": 0.01385498046875, + "learning_rate": 9.999221770660548e-07, + "loss": 0.1356, + "num_tokens": 334676637.0, + "reward": 1.1941964626312256, + "reward_std": 0.4048629105091095, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6852678656578064, + "rewards/tag_count_reward/std": 0.31639668345451355, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1254.571533203125, + "completions/mean_terminated_length": 937.2000122070312, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.10590804965105748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11722924563048634, + "kl": 0.01226806640625, + "learning_rate": 9.999158269126255e-07, + "loss": 0.1144, + "num_tokens": 335307677.0, + "reward": 1.176897406578064, + "reward_std": 0.478923499584198, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6746651530265808, + "rewards/tag_count_reward/std": 0.34399595856666565, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1258.993408203125, + "completions/mean_terminated_length": 877.552978515625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.10612114431836343, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11749255523740923, + "kl": 0.013092041015625, + "learning_rate": 9.999092277652242e-07, + "loss": 0.1208, + "num_tokens": 335935226.0, + "reward": 1.1875, + "reward_std": 0.3897707164287567, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6941964030265808, + "rewards/tag_count_reward/std": 0.3523011803627014, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1086.515625, + "completions/mean_terminated_length": 813.773681640625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.10633423898566938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13820285403362007, + "kl": 0.014739990234375, + "learning_rate": 9.99902379627503e-07, + "loss": 0.151, + "num_tokens": 336490241.0, + "reward": 1.2142857313156128, + "reward_std": 0.42686742544174194, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.3142428994178772, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1049.884033203125, + "completions/mean_terminated_length": 795.4622192382812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.10654733365297533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38228978955619164, + "kl": 0.0238800048828125, + "learning_rate": 9.99895282503252e-07, + "loss": 0.1193, + "num_tokens": 337027773.0, + "reward": 1.1908482313156128, + "reward_std": 0.44005751609802246, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7310267686843872, + "rewards/tag_count_reward/std": 0.2952408790588379, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1009.3192138671875, + "completions/mean_terminated_length": 714.6791381835938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.10676042832028128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15334362466840795, + "kl": 0.015472412109375, + "learning_rate": 9.998879363963983e-07, + "loss": 0.2147, + "num_tokens": 337549964.0, + "reward": 1.2622768878936768, + "reward_std": 0.44516825675964355, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7310267686843872, + "rewards/tag_count_reward/std": 0.3059394359588623, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1176.57373046875, + "completions/mean_terminated_length": 824.1786499023438, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.10697352298758724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15620953685457023, + "kl": 0.0352935791015625, + "learning_rate": 9.99880341311008e-07, + "loss": 0.1333, + "num_tokens": 338151773.0, + "reward": 1.1629464626312256, + "reward_std": 0.38759276270866394, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6941964030265808, + "rewards/tag_count_reward/std": 0.3368823826313019, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1075.3482666015625, + "completions/mean_terminated_length": 711.3496704101562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.1071866176548932, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13664039744713277, + "kl": 0.01177978515625, + "learning_rate": 9.998724972512838e-07, + "loss": 0.1406, + "num_tokens": 338705417.0, + "reward": 0.9782366752624512, + "reward_std": 0.34112757444381714, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.470055490732193, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6501116156578064, + "rewards/tag_count_reward/std": 0.3228525221347809, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1134.024658203125, + "completions/mean_terminated_length": 822.0689086914062, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.10739971232219914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11694709474631049, + "kl": 0.0136566162109375, + "learning_rate": 9.998644042215675e-07, + "loss": 0.1044, + "num_tokens": 339293860.0, + "reward": 1.07421875, + "reward_std": 0.3680979609489441, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6791294813156128, + "rewards/tag_count_reward/std": 0.3392232060432434, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1118.4910888671875, + "completions/mean_terminated_length": 804.9552001953125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1076128069895051, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11700136495266426, + "kl": 0.01434326171875, + "learning_rate": 9.998560622263376e-07, + "loss": 0.0674, + "num_tokens": 339860240.0, + "reward": 1.2890625, + "reward_std": 0.41393178701400757, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7533482313156128, + "rewards/tag_count_reward/std": 0.30696600675582886, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1138.8013916015625, + "completions/mean_terminated_length": 817.4229736328125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.10782590165681104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12451529046004653, + "kl": 0.0135040283203125, + "learning_rate": 9.998474712702108e-07, + "loss": 0.1283, + "num_tokens": 340444151.0, + "reward": 1.2790179252624512, + "reward_std": 0.40313443541526794, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.31778252124786377, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 999.482177734375, + "completions/mean_terminated_length": 721.0621337890625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.108038996324117, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15103539730126336, + "kl": 0.0154266357421875, + "learning_rate": 9.998386313579417e-07, + "loss": 0.1696, + "num_tokens": 340967183.0, + "reward": 1.1227679252624512, + "reward_std": 0.3666459619998932, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7142857313156128, + "rewards/tag_count_reward/std": 0.31124910712242126, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1067.953125, + "completions/mean_terminated_length": 771.6598510742188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.10825209099142294, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.8341277634904317, + "kl": 0.02423095703125, + "learning_rate": 9.998295424944222e-07, + "loss": 0.1571, + "num_tokens": 341517898.0, + "reward": 1.1540179252624512, + "reward_std": 0.3726150095462799, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6941964030265808, + "rewards/tag_count_reward/std": 0.32159388065338135, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1076.337158203125, + "completions/mean_terminated_length": 771.4457397460938, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1084651856587289, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12928863970447124, + "kl": 0.01434326171875, + "learning_rate": 9.998202046846825e-07, + "loss": 0.1443, + "num_tokens": 342068881.0, + "reward": 1.2008929252624512, + "reward_std": 0.4575641453266144, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7209821343421936, + "rewards/tag_count_reward/std": 0.3268306255340576, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1073.622802734375, + "completions/mean_terminated_length": 811.3966064453125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.10867828032603484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14403471799927503, + "kl": 0.015899658203125, + "learning_rate": 9.998106179338903e-07, + "loss": 0.1677, + "num_tokens": 342620728.0, + "reward": 1.2935268878936768, + "reward_std": 0.44815048575401306, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.2971627712249756, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1084.7857666015625, + "completions/mean_terminated_length": 744.314208984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.1088913749933408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13609437807517724, + "kl": 0.0151519775390625, + "learning_rate": 9.99800782247351e-07, + "loss": 0.1243, + "num_tokens": 343173864.0, + "reward": 1.1752232313156128, + "reward_std": 0.4100801646709442, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7176339030265808, + "rewards/tag_count_reward/std": 0.3147410750389099, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1065.029052734375, + "completions/mean_terminated_length": 821.33984375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.10910446966064674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12795470144425597, + "kl": 0.0147247314453125, + "learning_rate": 9.997906976305082e-07, + "loss": 0.1238, + "num_tokens": 343717493.0, + "reward": 1.2215402126312256, + "reward_std": 0.4134916663169861, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7572544813156128, + "rewards/tag_count_reward/std": 0.29267311096191406, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1140.0826416015625, + "completions/mean_terminated_length": 796.4707641601562, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1093175643279527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12921375300348603, + "kl": 0.0121002197265625, + "learning_rate": 9.997803640889428e-07, + "loss": 0.0828, + "num_tokens": 344305290.0, + "reward": 1.1049107313156128, + "reward_std": 0.412174791097641, + "rewards/accuracy_reward/mean": 0.39814814925193787, + "rewards/accuracy_reward/std": 0.49008384346961975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7209821343421936, + "rewards/tag_count_reward/std": 0.31194621324539185, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1014.450927734375, + "completions/mean_terminated_length": 754.6201171875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.10953065899525864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14095531891109297, + "kl": 0.016845703125, + "learning_rate": 9.997697816283734e-07, + "loss": 0.1177, + "num_tokens": 344837860.0, + "reward": 1.3264509439468384, + "reward_std": 0.45388615131378174, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7728794813156128, + "rewards/tag_count_reward/std": 0.28849291801452637, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1111.3504638671875, + "completions/mean_terminated_length": 817.4457397460938, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1097437536625646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13073510604903013, + "kl": 0.0164642333984375, + "learning_rate": 9.997589502546572e-07, + "loss": 0.1327, + "num_tokens": 345403857.0, + "reward": 1.2645089626312256, + "reward_std": 0.3639766275882721, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7533482313156128, + "rewards/tag_count_reward/std": 0.3105885982513428, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1125.419677734375, + "completions/mean_terminated_length": 810.5269775390625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.10995684832987054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12461563808646403, + "kl": 0.0165557861328125, + "learning_rate": 9.997478699737879e-07, + "loss": 0.1137, + "num_tokens": 345979725.0, + "reward": 1.2477679252624512, + "reward_std": 0.3813445270061493, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7321428656578064, + "rewards/tag_count_reward/std": 0.34019383788108826, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1050.01123046875, + "completions/mean_terminated_length": 755.8063354492188, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.1101699429971765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14330925751302245, + "kl": 0.01873779296875, + "learning_rate": 9.997365407918978e-07, + "loss": 0.1196, + "num_tokens": 346518402.0, + "reward": 1.1930804252624512, + "reward_std": 0.4103604555130005, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.30229419469833374, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1137.90185546875, + "completions/mean_terminated_length": 886.3931274414062, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.11038303766448244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13847372616222797, + "kl": 0.01666259765625, + "learning_rate": 9.99724962715257e-07, + "loss": 0.0695, + "num_tokens": 347099766.0, + "reward": 1.3543527126312256, + "reward_std": 0.41691386699676514, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.3035238981246948, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1168.560302734375, + "completions/mean_terminated_length": 868.3922729492188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.1105961323317884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1235694863220792, + "kl": 0.0141143798828125, + "learning_rate": 9.997131357502726e-07, + "loss": 0.0638, + "num_tokens": 347693329.0, + "reward": 1.1422991752624512, + "reward_std": 0.4199191629886627, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7003348469734192, + "rewards/tag_count_reward/std": 0.3215157389640808, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1343.1875, + "completions/mean_terminated_length": 895.6058349609375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.11080922699909435, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11935521466152466, + "kl": 0.013671875, + "learning_rate": 9.9970105990349e-07, + "loss": 0.1195, + "num_tokens": 348368373.0, + "reward": 0.9793527126312256, + "reward_std": 0.4353788495063782, + "rewards/accuracy_reward/mean": 0.3370535671710968, + "rewards/accuracy_reward/std": 0.47323182225227356, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6422991156578064, + "rewards/tag_count_reward/std": 0.35628247261047363, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 984.075927734375, + "completions/mean_terminated_length": 716.6089477539062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.1110223216664003, + "frac_reward_zero_std": 0.0, + "grad_norm": 18.379072375856186, + "kl": 0.035186767578125, + "learning_rate": 9.99688735181593e-07, + "loss": 0.1555, + "num_tokens": 348881271.0, + "reward": 1.1266741752624512, + "reward_std": 0.3916095197200775, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6869419813156128, + "rewards/tag_count_reward/std": 0.3260883092880249, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1219.0179443359375, + "completions/mean_terminated_length": 826.3421020507812, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.11123541633370625, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.16346793855284342, + "kl": 0.0158233642578125, + "learning_rate": 9.996761615914013e-07, + "loss": 0.1993, + "num_tokens": 349499199.0, + "reward": 1.0379464626312256, + "reward_std": 0.4653177857398987, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6651785969734192, + "rewards/tag_count_reward/std": 0.34685155749320984, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1081.5625, + "completions/mean_terminated_length": 782.0233764648438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.1114485110010122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15460437359577386, + "kl": 0.0168609619140625, + "learning_rate": 9.996633391398742e-07, + "loss": 0.1182, + "num_tokens": 350059067.0, + "reward": 1.3203125, + "reward_std": 0.43013155460357666, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7488839030265808, + "rewards/tag_count_reward/std": 0.33066415786743164, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1116.5960693359375, + "completions/mean_terminated_length": 859.1994018554688, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.11166160566831815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14815642598142983, + "kl": 0.0152740478515625, + "learning_rate": 9.996502678341075e-07, + "loss": 0.0672, + "num_tokens": 350627926.0, + "reward": 1.313616156578064, + "reward_std": 0.4055522084236145, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7287946343421936, + "rewards/tag_count_reward/std": 0.31701749563217163, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1168.4107666015625, + "completions/mean_terminated_length": 824.2236328125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.1118747003356241, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12272438572086954, + "kl": 0.0148468017578125, + "learning_rate": 9.996369476813355e-07, + "loss": 0.1123, + "num_tokens": 351219918.0, + "reward": 1.2209821939468384, + "reward_std": 0.477637380361557, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7321428656578064, + "rewards/tag_count_reward/std": 0.3167831301689148, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1072.216552734375, + "completions/mean_terminated_length": 872.8629150390625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.11208779500293005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12197009621679969, + "kl": 0.0158538818359375, + "learning_rate": 9.996233786889298e-07, + "loss": 0.0793, + "num_tokens": 351767055.0, + "reward": 1.2918527126312256, + "reward_std": 0.4216061532497406, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.7583705186843872, + "rewards/tag_count_reward/std": 0.3034334182739258, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1204.357177734375, + "completions/mean_terminated_length": 929.798828125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.112300889670236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27005150137523415, + "kl": 0.01641845703125, + "learning_rate": 9.996095608643995e-07, + "loss": 0.1296, + "num_tokens": 352374495.0, + "reward": 1.1422991752624512, + "reward_std": 0.44155457615852356, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7137276530265808, + "rewards/tag_count_reward/std": 0.3198261260986328, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1173.34375, + "completions/mean_terminated_length": 856.978759765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.11251398433754195, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1260793117385092, + "kl": 0.0139923095703125, + "learning_rate": 9.99595494215392e-07, + "loss": 0.118, + "num_tokens": 352966697.0, + "reward": 1.0385044813156128, + "reward_std": 0.41826698184013367, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6791294813156128, + "rewards/tag_count_reward/std": 0.3296067714691162, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 995.8839721679688, + "completions/mean_terminated_length": 842.5064086914062, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.11272707900484791, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13190721492863094, + "kl": 0.01690673828125, + "learning_rate": 9.995811787496922e-07, + "loss": 0.0966, + "num_tokens": 353477653.0, + "reward": 1.3588169813156128, + "reward_std": 0.3607838749885559, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7873883843421936, + "rewards/tag_count_reward/std": 0.2729867994785309, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1087.779052734375, + "completions/mean_terminated_length": 804.7080688476562, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.11294017367215385, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15513465383177902, + "kl": 0.0165252685546875, + "learning_rate": 9.995666144752225e-07, + "loss": 0.1708, + "num_tokens": 354035266.0, + "reward": 1.2075893878936768, + "reward_std": 0.37594375014305115, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7120535969734192, + "rewards/tag_count_reward/std": 0.3185359239578247, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1130.325927734375, + "completions/mean_terminated_length": 876.7236328125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.11315326833945981, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12860773206909873, + "kl": 0.01416015625, + "learning_rate": 9.99551801400043e-07, + "loss": 0.1314, + "num_tokens": 354615556.0, + "reward": 1.2767857313156128, + "reward_std": 0.4374081492424011, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.31750741600990295, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1130.212158203125, + "completions/mean_terminated_length": 775.0309448242188, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.11336636300676575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13410960701045807, + "kl": 0.014862060546875, + "learning_rate": 9.995367395323516e-07, + "loss": 0.1359, + "num_tokens": 355181219.0, + "reward": 1.157366156578064, + "reward_std": 0.3704836964607239, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6908482313156128, + "rewards/tag_count_reward/std": 0.32703492045402527, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1196.58935546875, + "completions/mean_terminated_length": 817.5741577148438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.11357945767407171, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11653091325140572, + "kl": 0.0120086669921875, + "learning_rate": 9.995214288804841e-07, + "loss": 0.12, + "num_tokens": 355784155.0, + "reward": 1.1171875, + "reward_std": 0.4092658758163452, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6729910969734192, + "rewards/tag_count_reward/std": 0.34178823232650757, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1075.0692138671875, + "completions/mean_terminated_length": 879.439697265625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.11379255234137765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12166022349041769, + "kl": 0.0179443359375, + "learning_rate": 9.995058694529135e-07, + "loss": 0.1029, + "num_tokens": 356333994.0, + "reward": 1.4642857313156128, + "reward_std": 0.44063979387283325, + "rewards/accuracy_reward/mean": 0.6897321343421936, + "rewards/accuracy_reward/std": 0.46312037110328674, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7745535969734192, + "rewards/tag_count_reward/std": 0.2953021824359894, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1121.265625, + "completions/mean_terminated_length": 919.8016357421875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.11400564700868361, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7425105937638388, + "kl": 0.092041015625, + "learning_rate": 9.99490061258251e-07, + "loss": 0.1163, + "num_tokens": 356912545.0, + "reward": 1.329241156578064, + "reward_std": 0.47690925002098083, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7198660969734192, + "rewards/tag_count_reward/std": 0.3158497214317322, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1282.638427734375, + "completions/mean_terminated_length": 927.4705810546875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.11421874167598955, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1333996275925524, + "kl": 0.0131988525390625, + "learning_rate": 9.994740043052451e-07, + "loss": 0.0849, + "num_tokens": 357565855.0, + "reward": 1.040178656578064, + "reward_std": 0.40426236391067505, + "rewards/accuracy_reward/mean": 0.3683035671710968, + "rewards/accuracy_reward/std": 0.4828835725784302, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.33617010712623596, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1285.852783203125, + "completions/mean_terminated_length": 928.5180053710938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.11443183634329551, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12787905799468002, + "kl": 0.0139007568359375, + "learning_rate": 9.99457698602782e-07, + "loss": 0.141, + "num_tokens": 358212349.0, + "reward": 1.1077009439468384, + "reward_std": 0.4428555369377136, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.6746651530265808, + "rewards/tag_count_reward/std": 0.3332604467868805, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 994.2835083007812, + "completions/mean_terminated_length": 792.5079345703125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.11464493101060146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14556751045745503, + "kl": 0.0163421630859375, + "learning_rate": 9.994411441598858e-07, + "loss": 0.1171, + "num_tokens": 358726508.0, + "reward": 1.2075893878936768, + "reward_std": 0.39020565152168274, + "rewards/accuracy_reward/mean": 0.4791666567325592, + "rewards/accuracy_reward/std": 0.5001450181007385, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7455357313156128, + "rewards/tag_count_reward/std": 0.3000412583351135, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1054.85498046875, + "completions/mean_terminated_length": 861.5226440429688, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.11485802567790741, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13246574499671018, + "kl": 0.0164947509765625, + "learning_rate": 9.994243409857184e-07, + "loss": 0.1355, + "num_tokens": 359273643.0, + "reward": 1.1802456378936768, + "reward_std": 0.37442320585250854, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.7315848469734192, + "rewards/tag_count_reward/std": 0.2902616858482361, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1084.305908203125, + "completions/mean_terminated_length": 792.9564208984375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.11507112034521336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14541726033840127, + "kl": 0.0151214599609375, + "learning_rate": 9.994072890895786e-07, + "loss": 0.1289, + "num_tokens": 359832708.0, + "reward": 1.0943081378936768, + "reward_std": 0.4312390685081482, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6947544813156128, + "rewards/tag_count_reward/std": 0.3253639042377472, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1069.6116943359375, + "completions/mean_terminated_length": 802.7784423828125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.11528421501251931, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13979966313418196, + "kl": 0.0152130126953125, + "learning_rate": 9.993899884809032e-07, + "loss": 0.1432, + "num_tokens": 360374870.0, + "reward": 1.1389509439468384, + "reward_std": 0.41030871868133545, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6880580186843872, + "rewards/tag_count_reward/std": 0.32285642623901367, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1106.446533203125, + "completions/mean_terminated_length": 788.8477783203125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.11549730967982526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22971373969713504, + "kl": 0.018524169921875, + "learning_rate": 9.993724391692675e-07, + "loss": 0.1045, + "num_tokens": 360934878.0, + "reward": 1.2349331378936768, + "reward_std": 0.3717692792415619, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.32729583978652954, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1030.2054443359375, + "completions/mean_terminated_length": 767.1798095703125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.11571040434713122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1429312777871901, + "kl": 0.02386474609375, + "learning_rate": 9.993546411643828e-07, + "loss": 0.1664, + "num_tokens": 361466442.0, + "reward": 1.2912946939468384, + "reward_std": 0.36561089754104614, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7243303656578064, + "rewards/tag_count_reward/std": 0.309089720249176, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1215.1138916015625, + "completions/mean_terminated_length": 881.9594116210938, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.11592349901443716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12404947796649639, + "kl": 0.013031005859375, + "learning_rate": 9.993365944760997e-07, + "loss": 0.1157, + "num_tokens": 362085661.0, + "reward": 1.15234375, + "reward_std": 0.4340263307094574, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.33053719997406006, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1044.35498046875, + "completions/mean_terminated_length": 792.0418701171875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.11613659368174312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1293230117473075, + "kl": 0.015289306640625, + "learning_rate": 9.993182991144052e-07, + "loss": 0.1189, + "num_tokens": 362618428.0, + "reward": 1.2516741752624512, + "reward_std": 0.40109357237815857, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.30781227350234985, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1100.3192138671875, + "completions/mean_terminated_length": 865.3788452148438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.11634968834904906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1279928632691155, + "kl": 0.0162200927734375, + "learning_rate": 9.992997550894246e-07, + "loss": 0.1052, + "num_tokens": 363177515.0, + "reward": 1.266741156578064, + "reward_std": 0.40209218859672546, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7176339030265808, + "rewards/tag_count_reward/std": 0.31783556938171387, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1085.5357666015625, + "completions/mean_terminated_length": 843.5753784179688, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.11656278301635502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1333956759794662, + "kl": 0.01458740234375, + "learning_rate": 9.992809624114205e-07, + "loss": 0.1045, + "num_tokens": 363726811.0, + "reward": 1.2427456378936768, + "reward_std": 0.4335285723209381, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7137276530265808, + "rewards/tag_count_reward/std": 0.30734148621559143, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1170.7835693359375, + "completions/mean_terminated_length": 823.7227172851562, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.11677587768366096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12050974634552532, + "kl": 0.014495849609375, + "learning_rate": 9.992619210907934e-07, + "loss": 0.1517, + "num_tokens": 364320474.0, + "reward": 1.204241156578064, + "reward_std": 0.4439477324485779, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6752232313156128, + "rewards/tag_count_reward/std": 0.3255350887775421, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1095.7545166015625, + "completions/mean_terminated_length": 885.5858154296875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.11698897235096692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11787082270924935, + "kl": 0.013092041015625, + "learning_rate": 9.992426311380808e-07, + "loss": 0.1024, + "num_tokens": 364884220.0, + "reward": 1.1590402126312256, + "reward_std": 0.4144341051578522, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7215401530265808, + "rewards/tag_count_reward/std": 0.30725616216659546, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1016.9375610351562, + "completions/mean_terminated_length": 754.11767578125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.11720206701827286, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14411149970135298, + "kl": 0.0152435302734375, + "learning_rate": 9.992230925639584e-07, + "loss": 0.1037, + "num_tokens": 365408256.0, + "reward": 1.2248884439468384, + "reward_std": 0.3508469760417938, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7248883843421936, + "rewards/tag_count_reward/std": 0.28193163871765137, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1160.055908203125, + "completions/mean_terminated_length": 827.7576293945312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.11741516168557882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1260065380952364, + "kl": 0.0141754150390625, + "learning_rate": 9.992033053792397e-07, + "loss": 0.1072, + "num_tokens": 366000169.0, + "reward": 1.05859375, + "reward_std": 0.4481099247932434, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6858258843421936, + "rewards/tag_count_reward/std": 0.32111555337905884, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1118.296875, + "completions/mean_terminated_length": 797.228271484375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.11762825635288476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12346759248171382, + "kl": 0.0135345458984375, + "learning_rate": 9.991832695948747e-07, + "loss": 0.117, + "num_tokens": 366566158.0, + "reward": 1.0814732313156128, + "reward_std": 0.3837091326713562, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.32781273126602173, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1055.450927734375, + "completions/mean_terminated_length": 812.8278198242188, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.11784135102019072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14131325662348282, + "kl": 0.0155487060546875, + "learning_rate": 9.991629852219523e-07, + "loss": 0.128, + "num_tokens": 367104088.0, + "reward": 1.1489956378936768, + "reward_std": 0.4467398226261139, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.308691143989563, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 884.82373046875, + "completions/mean_terminated_length": 745.2424926757812, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.11805444568749666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25119218626675094, + "kl": 0.02606201171875, + "learning_rate": 9.991424522716978e-07, + "loss": 0.0535, + "num_tokens": 367568553.0, + "reward": 1.3816964626312256, + "reward_std": 0.34707847237586975, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7857142686843872, + "rewards/tag_count_reward/std": 0.27448779344558716, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1129.6273193359375, + "completions/mean_terminated_length": 869.1146240234375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.11826754035480262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11701143350943649, + "kl": 0.01519775390625, + "learning_rate": 9.99121670755475e-07, + "loss": 0.1171, + "num_tokens": 368145570.0, + "reward": 1.2075893878936768, + "reward_std": 0.46509990096092224, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7142857313156128, + "rewards/tag_count_reward/std": 0.31879445910453796, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1064.529052734375, + "completions/mean_terminated_length": 857.2026977539062, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.11848063502210857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12077757181883189, + "kl": 0.0151519775390625, + "learning_rate": 9.99100640684785e-07, + "loss": 0.0744, + "num_tokens": 368693471.0, + "reward": 1.1813616752624512, + "reward_std": 0.4389062821865082, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7349330186843872, + "rewards/tag_count_reward/std": 0.2856007516384125, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1063.118408203125, + "completions/mean_terminated_length": 842.4617309570312, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.11869372968941452, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13249580267094713, + "kl": 0.01611328125, + "learning_rate": 9.990793620712657e-07, + "loss": 0.0958, + "num_tokens": 369240996.0, + "reward": 1.2924107313156128, + "reward_std": 0.4899755120277405, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.297794371843338, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1157.19873046875, + "completions/mean_terminated_length": 796.9686279296875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.11890682435672047, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13194843495605557, + "kl": 0.014251708984375, + "learning_rate": 9.990578349266939e-07, + "loss": 0.0993, + "num_tokens": 369834253.0, + "reward": 1.0853794813156128, + "reward_std": 0.3956058919429779, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.31494081020355225, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1127.8951416015625, + "completions/mean_terminated_length": 853.1971435546875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.11911991902402642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12547571916045897, + "kl": 0.015045166015625, + "learning_rate": 9.990360592629827e-07, + "loss": 0.1011, + "num_tokens": 370403822.0, + "reward": 1.2109375, + "reward_std": 0.4083310663700104, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7310267686843872, + "rewards/tag_count_reward/std": 0.314503014087677, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 993.19873046875, + "completions/mean_terminated_length": 728.025146484375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.11933301369133237, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14974636872820007, + "kl": 0.01580810546875, + "learning_rate": 9.990140350921837e-07, + "loss": 0.0939, + "num_tokens": 370913143.0, + "reward": 1.231584906578064, + "reward_std": 0.3409159779548645, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7137276530265808, + "rewards/tag_count_reward/std": 0.2846681475639343, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1152.732177734375, + "completions/mean_terminated_length": 868.3529663085938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.11954610835863833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.129940523584444, + "kl": 0.0146484375, + "learning_rate": 9.989917624264854e-07, + "loss": 0.115, + "num_tokens": 371495935.0, + "reward": 1.0859375, + "reward_std": 0.4487876892089844, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6930803656578064, + "rewards/tag_count_reward/std": 0.29031166434288025, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1071.341552734375, + "completions/mean_terminated_length": 822.389404296875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.11975920302594427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14072939191506276, + "kl": 0.013671875, + "learning_rate": 9.989692412782137e-07, + "loss": 0.1141, + "num_tokens": 372048632.0, + "reward": 1.1400669813156128, + "reward_std": 0.43616780638694763, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6958705186843872, + "rewards/tag_count_reward/std": 0.3065280020236969, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1151.950927734375, + "completions/mean_terminated_length": 816.61962890625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.11997229769325023, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13449460635395685, + "kl": 0.01605224609375, + "learning_rate": 9.989464716598327e-07, + "loss": 0.1255, + "num_tokens": 372628130.0, + "reward": 1.2639509439468384, + "reward_std": 0.4298105239868164, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6947544813156128, + "rewards/tag_count_reward/std": 0.33924898505210876, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1065.118408203125, + "completions/mean_terminated_length": 800.6033935546875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.12018539236055617, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13335179386134763, + "kl": 0.0149993896484375, + "learning_rate": 9.989234535839436e-07, + "loss": 0.1432, + "num_tokens": 373182423.0, + "reward": 1.2114956378936768, + "reward_std": 0.4202878773212433, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.30598992109298706, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1239.9598388671875, + "completions/mean_terminated_length": 967.3970336914062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.12039848702786213, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11739770586083305, + "kl": 0.0137481689453125, + "learning_rate": 9.989001870632852e-07, + "loss": 0.0446, + "num_tokens": 373805221.0, + "reward": 1.2516741752624512, + "reward_std": 0.41214022040367126, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7159598469734192, + "rewards/tag_count_reward/std": 0.3291178047657013, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1156.4888916015625, + "completions/mean_terminated_length": 855.7701416015625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.12061158169516809, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13006548620854622, + "kl": 0.0133819580078125, + "learning_rate": 9.988766721107336e-07, + "loss": 0.1195, + "num_tokens": 374397952.0, + "reward": 1.079241156578064, + "reward_std": 0.39850112795829773, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.485796183347702, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6997767686843872, + "rewards/tag_count_reward/std": 0.31505823135375977, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1090.84375, + "completions/mean_terminated_length": 805.0841064453125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.12082467636247403, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14112731925731012, + "kl": 0.015106201171875, + "learning_rate": 9.988529087393026e-07, + "loss": 0.1089, + "num_tokens": 374963338.0, + "reward": 1.157366156578064, + "reward_std": 0.38730138540267944, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6975446343421936, + "rewards/tag_count_reward/std": 0.307956725358963, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1008.2053833007812, + "completions/mean_terminated_length": 709.413818359375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.12103777102977999, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12950012178344686, + "kl": 0.0142669677734375, + "learning_rate": 9.988288969621433e-07, + "loss": 0.167, + "num_tokens": 375482374.0, + "reward": 1.1099331378936768, + "reward_std": 0.3620772957801819, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6947544813156128, + "rewards/tag_count_reward/std": 0.299870103597641, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 963.997802734375, + "completions/mean_terminated_length": 799.5861206054688, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.12125086569708593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14305398855083126, + "kl": 0.016510009765625, + "learning_rate": 9.988046367925445e-07, + "loss": 0.0634, + "num_tokens": 375986437.0, + "reward": 1.3404018878936768, + "reward_std": 0.35928723216056824, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7622767686843872, + "rewards/tag_count_reward/std": 0.2667539417743683, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1238.15185546875, + "completions/mean_terminated_length": 892.5477905273438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.12146396036439189, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11384838146082502, + "kl": 0.014556884765625, + "learning_rate": 9.987801282439321e-07, + "loss": 0.1299, + "num_tokens": 376613305.0, + "reward": 1.1780134439468384, + "reward_std": 0.4377499520778656, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6891741156578064, + "rewards/tag_count_reward/std": 0.3303444981575012, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1012.30810546875, + "completions/mean_terminated_length": 842.8311767578125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.12167705503169783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14804857877856392, + "kl": 0.0182952880859375, + "learning_rate": 9.987553713298703e-07, + "loss": 0.1462, + "num_tokens": 377133331.0, + "reward": 1.4291294813156128, + "reward_std": 0.40965503454208374, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7996651530265808, + "rewards/tag_count_reward/std": 0.27764368057250977, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1148.6875, + "completions/mean_terminated_length": 827.1151123046875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.12189014969900379, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1202158473366826, + "kl": 0.015350341796875, + "learning_rate": 9.987303660640595e-07, + "loss": 0.1102, + "num_tokens": 377724391.0, + "reward": 1.3510044813156128, + "reward_std": 0.457707017660141, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7438616156578064, + "rewards/tag_count_reward/std": 0.31523799896240234, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 956.9888916015625, + "completions/mean_terminated_length": 761.7553100585938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.12210324436630973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13732163923228674, + "kl": 0.01849365234375, + "learning_rate": 9.987051124603385e-07, + "loss": 0.0914, + "num_tokens": 378218226.0, + "reward": 1.4330357313156128, + "reward_std": 0.370051771402359, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7946428656578064, + "rewards/tag_count_reward/std": 0.2752145528793335, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1158.962158203125, + "completions/mean_terminated_length": 910.0314331054688, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.12231633903361569, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11676670658941553, + "kl": 0.014923095703125, + "learning_rate": 9.986796105326831e-07, + "loss": 0.1224, + "num_tokens": 378810577.0, + "reward": 1.3504464626312256, + "reward_std": 0.43793973326683044, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7544642686843872, + "rewards/tag_count_reward/std": 0.3181358873844147, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1134.122802734375, + "completions/mean_terminated_length": 840.2802124023438, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.12252943370092163, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13521602211336053, + "kl": 0.0147247314453125, + "learning_rate": 9.98653860295207e-07, + "loss": 0.1528, + "num_tokens": 379390984.0, + "reward": 1.2126116752624512, + "reward_std": 0.3841191828250885, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396106719971, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.29306527972221375, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1142.196533203125, + "completions/mean_terminated_length": 857.970703125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.12274252836822759, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1270612330381728, + "kl": 0.0153961181640625, + "learning_rate": 9.986278617621607e-07, + "loss": 0.1305, + "num_tokens": 379970400.0, + "reward": 1.0943081378936768, + "reward_std": 0.42143312096595764, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7081473469734192, + "rewards/tag_count_reward/std": 0.31914231181144714, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 905.5402221679688, + "completions/mean_terminated_length": 683.1412963867188, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.12295562303553353, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1563940452782182, + "kl": 0.0179443359375, + "learning_rate": 9.986016149479323e-07, + "loss": 0.1285, + "num_tokens": 380448386.0, + "reward": 1.31640625, + "reward_std": 0.3732053339481354, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7583705186843872, + "rewards/tag_count_reward/std": 0.2733752429485321, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1091.85498046875, + "completions/mean_terminated_length": 880.8256225585938, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.12316871770283949, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1324270800302498, + "kl": 0.0157928466796875, + "learning_rate": 9.985751198670474e-07, + "loss": 0.0805, + "num_tokens": 381009569.0, + "reward": 1.3125, + "reward_std": 0.46132099628448486, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7522321343421936, + "rewards/tag_count_reward/std": 0.3028491139411926, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1174.9888916015625, + "completions/mean_terminated_length": 894.2861328125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.12338181237014544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11677519099572714, + "kl": 0.0135955810546875, + "learning_rate": 9.985483765341695e-07, + "loss": 0.1068, + "num_tokens": 381609916.0, + "reward": 1.2578125, + "reward_std": 0.420994371175766, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7287946343421936, + "rewards/tag_count_reward/std": 0.3130228817462921, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1102.8170166015625, + "completions/mean_terminated_length": 891.0546264648438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.1235949070374514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13369918710314907, + "kl": 0.01806640625, + "learning_rate": 9.985213849640985e-07, + "loss": 0.075, + "num_tokens": 382172746.0, + "reward": 1.4436384439468384, + "reward_std": 0.3795551657676697, + "rewards/accuracy_reward/mean": 0.6629464030265808, + "rewards/accuracy_reward/std": 0.47323182225227356, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7806919813156128, + "rewards/tag_count_reward/std": 0.28532519936561584, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1150.1629638671875, + "completions/mean_terminated_length": 821.6859741210938, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.12380800170475734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1342163847248937, + "kl": 0.0158538818359375, + "learning_rate": 9.984941451717722e-07, + "loss": 0.1109, + "num_tokens": 382757251.0, + "reward": 1.2466518878936768, + "reward_std": 0.3976649045944214, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7555803656578064, + "rewards/tag_count_reward/std": 0.3145824074745178, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1115.4263916015625, + "completions/mean_terminated_length": 867.7937622070312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1240210963720633, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12076706629120385, + "kl": 0.0136566162109375, + "learning_rate": 9.984666571722663e-07, + "loss": 0.0955, + "num_tokens": 383325538.0, + "reward": 1.2204241752624512, + "reward_std": 0.3612907826900482, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7271205186843872, + "rewards/tag_count_reward/std": 0.30681297183036804, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1166.4398193359375, + "completions/mean_terminated_length": 926.0142211914062, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.12423419103936924, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12239118289945367, + "kl": 0.015380859375, + "learning_rate": 9.984389209807924e-07, + "loss": 0.1026, + "num_tokens": 383923527.0, + "reward": 1.2661831378936768, + "reward_std": 0.44759076833724976, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.2930056154727936, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1133.8817138671875, + "completions/mean_terminated_length": 803.2431640625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1244472857066752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13459010859580972, + "kl": 0.0155029296875, + "learning_rate": 9.98410936612701e-07, + "loss": 0.1256, + "num_tokens": 384501090.0, + "reward": 1.1350446939468384, + "reward_std": 0.41443201899528503, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6975446343421936, + "rewards/tag_count_reward/std": 0.3204174339771271, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1204.6160888671875, + "completions/mean_terminated_length": 923.4880981445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.12466038037398114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27717277354516673, + "kl": 0.0333251953125, + "learning_rate": 9.983827040834791e-07, + "loss": 0.098, + "num_tokens": 385112486.0, + "reward": 1.2756696939468384, + "reward_std": 0.4520913064479828, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7488839030265808, + "rewards/tag_count_reward/std": 0.3159603774547577, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1139.3348388671875, + "completions/mean_terminated_length": 898.0508422851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.1248734750412871, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1265985054354985, + "kl": 0.015350341796875, + "learning_rate": 9.983542234087511e-07, + "loss": 0.0929, + "num_tokens": 385688828.0, + "reward": 1.2377232313156128, + "reward_std": 0.3778587579727173, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7154017686843872, + "rewards/tag_count_reward/std": 0.3197934329509735, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1244.435302734375, + "completions/mean_terminated_length": 919.4827270507812, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.12508656970859305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11671678938713659, + "kl": 0.0145721435546875, + "learning_rate": 9.98325494604279e-07, + "loss": 0.0872, + "num_tokens": 386313375.0, + "reward": 1.1640625, + "reward_std": 0.43318504095077515, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7020089030265808, + "rewards/tag_count_reward/std": 0.340558797121048, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 969.0714721679688, + "completions/mean_terminated_length": 762.4680786132812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.12529966437589898, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1318291513333798, + "kl": 0.0173187255859375, + "learning_rate": 9.982965176859622e-07, + "loss": 0.0864, + "num_tokens": 386811855.0, + "reward": 1.3286831378936768, + "reward_std": 0.4062252938747406, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.3179234266281128, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1119.0625, + "completions/mean_terminated_length": 852.1264038085938, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.12551275904320494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1303657070856923, + "kl": 0.018035888671875, + "learning_rate": 9.98267292669837e-07, + "loss": 0.0709, + "num_tokens": 387387227.0, + "reward": 1.2756696939468384, + "reward_std": 0.37614575028419495, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7511160969734192, + "rewards/tag_count_reward/std": 0.293952614068985, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1265.953125, + "completions/mean_terminated_length": 876.2374267578125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1257258537105109, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12088444046628818, + "kl": 0.014068603515625, + "learning_rate": 9.982378195720775e-07, + "loss": 0.0971, + "num_tokens": 388022710.0, + "reward": 1.114397406578064, + "reward_std": 0.45109930634498596, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6568080186843872, + "rewards/tag_count_reward/std": 0.3521164059638977, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1119.3460693359375, + "completions/mean_terminated_length": 835.0641479492188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.12593894837781686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13062892459143882, + "kl": 0.015625, + "learning_rate": 9.98208098408994e-07, + "loss": 0.1069, + "num_tokens": 388597329.0, + "reward": 1.2739956378936768, + "reward_std": 0.3894144892692566, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.31106001138687134, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1247.243408203125, + "completions/mean_terminated_length": 933.9037475585938, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.12615204304512279, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9600791058869924, + "kl": 0.0240020751953125, + "learning_rate": 9.98178129197036e-07, + "loss": 0.0865, + "num_tokens": 389227262.0, + "reward": 1.1813616752624512, + "reward_std": 0.5229811072349548, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7193080186843872, + "rewards/tag_count_reward/std": 0.30245262384414673, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1244.5357666015625, + "completions/mean_terminated_length": 894.3076782226562, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.12636513771242874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12524629749491442, + "kl": 0.0147552490234375, + "learning_rate": 9.981479119527883e-07, + "loss": 0.1135, + "num_tokens": 389858910.0, + "reward": 1.110491156578064, + "reward_std": 0.45997804403305054, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6930803656578064, + "rewards/tag_count_reward/std": 0.33083024621009827, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1091.4263916015625, + "completions/mean_terminated_length": 809.4306030273438, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.1265782323797347, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1390816633199968, + "kl": 0.0156097412109375, + "learning_rate": 9.981174466929742e-07, + "loss": 0.1027, + "num_tokens": 390414397.0, + "reward": 1.1400669813156128, + "reward_std": 0.4445279836654663, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7293526530265808, + "rewards/tag_count_reward/std": 0.29631030559539795, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1122.53125, + "completions/mean_terminated_length": 902.6685180664062, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.12679132704704066, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13768040920987817, + "kl": 0.0168304443359375, + "learning_rate": 9.980867334344539e-07, + "loss": 0.0998, + "num_tokens": 390989339.0, + "reward": 1.2360491752624512, + "reward_std": 0.37105876207351685, + "rewards/accuracy_reward/mean": 0.4930555522441864, + "rewards/accuracy_reward/std": 0.5005314350128174, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7606026530265808, + "rewards/tag_count_reward/std": 0.27936944365501404, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1103.9442138671875, + "completions/mean_terminated_length": 822.095703125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.1270044217143466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1261107229382252, + "kl": 0.0153045654296875, + "learning_rate": 9.980557721942243e-07, + "loss": 0.1043, + "num_tokens": 391550530.0, + "reward": 1.3404018878936768, + "reward_std": 0.37021195888519287, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7600446343421936, + "rewards/tag_count_reward/std": 0.2966245412826538, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1397.227783203125, + "completions/mean_terminated_length": 960.1417846679688, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.12721751638165255, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11546365972495108, + "kl": 0.0106658935546875, + "learning_rate": 9.98024562989421e-07, + "loss": 0.1319, + "num_tokens": 392249480.0, + "reward": 0.8984375596046448, + "reward_std": 0.3560902774333954, + "rewards/accuracy_reward/mean": 0.3035714328289032, + "rewards/accuracy_reward/std": 0.46031373739242554, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5948660969734192, + "rewards/tag_count_reward/std": 0.349874347448349, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1256.665283203125, + "completions/mean_terminated_length": 918.9617919921875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.1274306110489585, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11326135773012093, + "kl": 0.0140380859375, + "learning_rate": 9.979931058373155e-07, + "loss": 0.0761, + "num_tokens": 392877746.0, + "reward": 1.1785714626312256, + "reward_std": 0.4073047935962677, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7142857313156128, + "rewards/tag_count_reward/std": 0.32228410243988037, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1079.763427734375, + "completions/mean_terminated_length": 822.6610107421875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.12764370571626446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12952250402107507, + "kl": 0.015594482421875, + "learning_rate": 9.979614007553166e-07, + "loss": 0.1066, + "num_tokens": 393426520.0, + "reward": 1.3459821939468384, + "reward_std": 0.4195391535758972, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7633928656578064, + "rewards/tag_count_reward/std": 0.30209797620773315, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1003.8638916015625, + "completions/mean_terminated_length": 854.7015380859375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.1278568003835704, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13740573734591055, + "kl": 0.018768310546875, + "learning_rate": 9.97929447760971e-07, + "loss": 0.0842, + "num_tokens": 393941323.0, + "reward": 1.6010044813156128, + "reward_std": 0.34866729378700256, + "rewards/accuracy_reward/mean": 0.7767857313156128, + "rewards/accuracy_reward/std": 0.41686633229255676, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.2626858651638031, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1078.90625, + "completions/mean_terminated_length": 855.2692260742188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.12806989505087635, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14318462971144025, + "kl": 0.017120361328125, + "learning_rate": 9.978972468719622e-07, + "loss": 0.1325, + "num_tokens": 394494945.0, + "reward": 1.328125, + "reward_std": 0.41569894552230835, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7767857313156128, + "rewards/tag_count_reward/std": 0.29652562737464905, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1201.274658203125, + "completions/mean_terminated_length": 912.2724609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1282829897181823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12690026125016113, + "kl": 0.0154266357421875, + "learning_rate": 9.978647981061108e-07, + "loss": 0.0603, + "num_tokens": 395105820.0, + "reward": 1.243303656578064, + "reward_std": 0.4231824278831482, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7254464030265808, + "rewards/tag_count_reward/std": 0.3254833221435547, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1264.3616943359375, + "completions/mean_terminated_length": 961.095947265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.12849608438548826, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11392107489248589, + "kl": 0.014801025390625, + "learning_rate": 9.978321014813748e-07, + "loss": 0.0765, + "num_tokens": 395741118.0, + "reward": 1.28125, + "reward_std": 0.4374000132083893, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7165178656578064, + "rewards/tag_count_reward/std": 0.31462404131889343, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 990.6272583007812, + "completions/mean_terminated_length": 774.6048583984375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.1287091790527942, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12087826663143154, + "kl": 0.015380859375, + "learning_rate": 9.97799157015849e-07, + "loss": 0.065, + "num_tokens": 396252471.0, + "reward": 1.33203125, + "reward_std": 0.2973857820034027, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7873883843421936, + "rewards/tag_count_reward/std": 0.27553585171699524, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1200.091552734375, + "completions/mean_terminated_length": 930.7559204101562, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.12892227372010015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13386125926621395, + "kl": 0.01373291015625, + "learning_rate": 9.977659647277663e-07, + "loss": 0.0993, + "num_tokens": 396856832.0, + "reward": 1.1936384439468384, + "reward_std": 0.4064633548259735, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7181919813156128, + "rewards/tag_count_reward/std": 0.3150200843811035, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1033.8482666015625, + "completions/mean_terminated_length": 852.3684692382812, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.1291353683874061, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1425421215203902, + "kl": 0.017364501953125, + "learning_rate": 9.977325246354956e-07, + "loss": 0.0829, + "num_tokens": 397381244.0, + "reward": 1.2907366752624512, + "reward_std": 0.41282179951667786, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.28231218457221985, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1201.227783203125, + "completions/mean_terminated_length": 880.7568969726562, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.12934846305471207, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13011060426531068, + "kl": 0.015533447265625, + "learning_rate": 9.976988367575433e-07, + "loss": 0.0913, + "num_tokens": 397995362.0, + "reward": 1.140625, + "reward_std": 0.39326173067092896, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7388392686843872, + "rewards/tag_count_reward/std": 0.3044935464859009, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1276.247802734375, + "completions/mean_terminated_length": 939.8429565429688, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.129561557722018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12135026705591755, + "kl": 0.012969970703125, + "learning_rate": 9.976649011125534e-07, + "loss": 0.1089, + "num_tokens": 398641729.0, + "reward": 1.153459906578064, + "reward_std": 0.45584455132484436, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6958705186843872, + "rewards/tag_count_reward/std": 0.32981881499290466, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1050.046875, + "completions/mean_terminated_length": 858.949462890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.12977465238932395, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13980480078285165, + "kl": 0.015380859375, + "learning_rate": 9.976307177193067e-07, + "loss": 0.1079, + "num_tokens": 399185830.0, + "reward": 1.1473214626312256, + "reward_std": 0.39241477847099304, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7321428656578064, + "rewards/tag_count_reward/std": 0.3118901550769806, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1029.7701416015625, + "completions/mean_terminated_length": 773.7904663085938, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1299877470566299, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13438484725231253, + "kl": 0.016448974609375, + "learning_rate": 9.975962865967208e-07, + "loss": 0.0733, + "num_tokens": 399715151.0, + "reward": 1.3443081378936768, + "reward_std": 0.35527294874191284, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7661830186843872, + "rewards/tag_count_reward/std": 0.3063283860683441, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1059.13623046875, + "completions/mean_terminated_length": 860.302978515625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.13020084172393587, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12661166257983653, + "kl": 0.014129638671875, + "learning_rate": 9.975616077638509e-07, + "loss": 0.0998, + "num_tokens": 400258268.0, + "reward": 1.2606027126312256, + "reward_std": 0.36463993787765503, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7739955186843872, + "rewards/tag_count_reward/std": 0.27652183175086975, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1124.2076416015625, + "completions/mean_terminated_length": 841.4140014648438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1304139363912418, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11263923560629889, + "kl": 0.01434326171875, + "learning_rate": 9.97526681239889e-07, + "loss": 0.0706, + "num_tokens": 400829353.0, + "reward": 1.219866156578064, + "reward_std": 0.3816168010234833, + "rewards/accuracy_reward/mean": 0.4861111044883728, + "rewards/accuracy_reward/std": 0.5003865361213684, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7511160969734192, + "rewards/tag_count_reward/std": 0.31015416979789734, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1258.325927734375, + "completions/mean_terminated_length": 942.4562377929688, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.13062703105854775, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13944191597400596, + "kl": 0.0150146484375, + "learning_rate": 9.974915070441643e-07, + "loss": 0.0951, + "num_tokens": 401462347.0, + "reward": 1.1858259439468384, + "reward_std": 0.5252411365509033, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.3229955732822418, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1057.4420166015625, + "completions/mean_terminated_length": 769.1239013671875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.1308401257258537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14184457125869523, + "kl": 0.0157623291015625, + "learning_rate": 9.974560851961428e-07, + "loss": 0.1277, + "num_tokens": 402008417.0, + "reward": 1.2566964626312256, + "reward_std": 0.38438650965690613, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7477678656578064, + "rewards/tag_count_reward/std": 0.30606386065483093, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1068.212158203125, + "completions/mean_terminated_length": 825.3119506835938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.13105322039315967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13084053709784318, + "kl": 0.0151214599609375, + "learning_rate": 9.974204157154284e-07, + "loss": 0.0942, + "num_tokens": 402557200.0, + "reward": 1.2215402126312256, + "reward_std": 0.4468922019004822, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7237723469734192, + "rewards/tag_count_reward/std": 0.305172860622406, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1075.763427734375, + "completions/mean_terminated_length": 834.7353515625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.1312663150604656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1284271053254254, + "kl": 0.0175933837890625, + "learning_rate": 9.973844986217606e-07, + "loss": 0.0858, + "num_tokens": 403103398.0, + "reward": 1.2935268878936768, + "reward_std": 0.35578545928001404, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.7555803656578064, + "rewards/tag_count_reward/std": 0.3114555776119232, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1018.90185546875, + "completions/mean_terminated_length": 777.9284057617188, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.13147940972777156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15305936447695034, + "kl": 0.017303466796875, + "learning_rate": 9.973483339350173e-07, + "loss": 0.0856, + "num_tokens": 403639834.0, + "reward": 1.1506696939468384, + "reward_std": 0.38870009779930115, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7198660969734192, + "rewards/tag_count_reward/std": 0.3032013773918152, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1078.1920166015625, + "completions/mean_terminated_length": 847.7955932617188, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.13169250439507751, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1292829583228314, + "kl": 0.01483154296875, + "learning_rate": 9.973119216752129e-07, + "loss": 0.0648, + "num_tokens": 404193264.0, + "reward": 1.2650669813156128, + "reward_std": 0.44154465198516846, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7516741156578064, + "rewards/tag_count_reward/std": 0.2984345257282257, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 992.4777221679688, + "completions/mean_terminated_length": 783.6310424804688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.13190559906238347, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1516340580583699, + "kl": 0.01800537109375, + "learning_rate": 9.972752618624986e-07, + "loss": 0.1207, + "num_tokens": 404698950.0, + "reward": 1.3119419813156128, + "reward_std": 0.41222789883613586, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7806919813156128, + "rewards/tag_count_reward/std": 0.28187403082847595, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1199.415283203125, + "completions/mean_terminated_length": 923.24853515625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1321186937296894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1237324935344492, + "kl": 0.0140380859375, + "learning_rate": 9.97238354517163e-07, + "loss": 0.1148, + "num_tokens": 405305856.0, + "reward": 1.1434152126312256, + "reward_std": 0.4352182149887085, + "rewards/accuracy_reward/mean": 0.44212964177131653, + "rewards/accuracy_reward/std": 0.4972155690193176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7170758843421936, + "rewards/tag_count_reward/std": 0.3072642683982849, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1131.3348388671875, + "completions/mean_terminated_length": 904.0835571289062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.13233178839699536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13563462497660211, + "kl": 0.017669677734375, + "learning_rate": 9.972011996596311e-07, + "loss": 0.0806, + "num_tokens": 405878694.0, + "reward": 1.364397406578064, + "reward_std": 0.39978983998298645, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7818080186843872, + "rewards/tag_count_reward/std": 0.2957916557788849, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1010.6250610351562, + "completions/mean_terminated_length": 818.5184936523438, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.13254488306430132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1401930975661383, + "kl": 0.0185546875, + "learning_rate": 9.971637973104656e-07, + "loss": 0.0855, + "num_tokens": 406398078.0, + "reward": 1.4313616752624512, + "reward_std": 0.43678978085517883, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.27621012926101685, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1185.200927734375, + "completions/mean_terminated_length": 901.0148315429688, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.13275797773160727, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12895423973973746, + "kl": 0.01519775390625, + "learning_rate": 9.971261474903658e-07, + "loss": 0.1075, + "num_tokens": 407000456.0, + "reward": 1.1372768878936768, + "reward_std": 0.39142924547195435, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7154017686843872, + "rewards/tag_count_reward/std": 0.2966582179069519, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1261.4375, + "completions/mean_terminated_length": 946.8125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.1329710723989132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11316949963594328, + "kl": 0.013427734375, + "learning_rate": 9.970882502201679e-07, + "loss": 0.09, + "num_tokens": 407639036.0, + "reward": 1.1082589626312256, + "reward_std": 0.39533933997154236, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6863839030265808, + "rewards/tag_count_reward/std": 0.3121762275695801, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1241.669677734375, + "completions/mean_terminated_length": 886.4694213867188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.13318416706621916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10642194438336465, + "kl": 0.0135040283203125, + "learning_rate": 9.970501055208453e-07, + "loss": 0.089, + "num_tokens": 408261816.0, + "reward": 1.1902902126312256, + "reward_std": 0.4263664186000824, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6969866156578064, + "rewards/tag_count_reward/std": 0.337540864944458, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1141.51123046875, + "completions/mean_terminated_length": 884.3696899414062, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.13339726173352512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1735016125967563, + "kl": 0.019012451171875, + "learning_rate": 9.97011713413508e-07, + "loss": 0.0894, + "num_tokens": 408844989.0, + "reward": 1.2645089626312256, + "reward_std": 0.421338826417923, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7287946343421936, + "rewards/tag_count_reward/std": 0.3121282458305359, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1154.618408203125, + "completions/mean_terminated_length": 910.96875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.13361035640083108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12804612253774367, + "kl": 0.0163726806640625, + "learning_rate": 9.96973073919403e-07, + "loss": 0.1015, + "num_tokens": 409429666.0, + "reward": 1.3203125, + "reward_std": 0.45484986901283264, + "rewards/accuracy_reward/mean": 0.6111111044883728, + "rewards/accuracy_reward/std": 0.488063246011734, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.7287946343421936, + "rewards/tag_count_reward/std": 0.31657615303993225, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1148.4241943359375, + "completions/mean_terminated_length": 788.59375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.13382345106813703, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13172104875519944, + "kl": 0.0131683349609375, + "learning_rate": 9.969341870599148e-07, + "loss": 0.1081, + "num_tokens": 410010736.0, + "reward": 1.10546875, + "reward_std": 0.36399808526039124, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6947544813156128, + "rewards/tag_count_reward/std": 0.3317473530769348, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1271.26123046875, + "completions/mean_terminated_length": 1039.365234375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.13403654573544296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11875267632845464, + "kl": 0.015411376953125, + "learning_rate": 9.968950528565637e-07, + "loss": 0.0859, + "num_tokens": 410656981.0, + "reward": 1.2533482313156128, + "reward_std": 0.42358607053756714, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7287946343421936, + "rewards/tag_count_reward/std": 0.3053349256515503, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1029.671875, + "completions/mean_terminated_length": 798.1068725585938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.13424964040274892, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13772995985420225, + "kl": 0.015655517578125, + "learning_rate": 9.968556713310077e-07, + "loss": 0.1385, + "num_tokens": 411184754.0, + "reward": 1.266741156578064, + "reward_std": 0.39507296681404114, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7555803656578064, + "rewards/tag_count_reward/std": 0.30141741037368774, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1140.6317138671875, + "completions/mean_terminated_length": 866.31103515625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.13446273507005488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20208071788812898, + "kl": 0.0176239013671875, + "learning_rate": 9.968160425050418e-07, + "loss": 0.1291, + "num_tokens": 411766349.0, + "reward": 1.227678656578064, + "reward_std": 0.4629303216934204, + "rewards/accuracy_reward/mean": 0.5462962985038757, + "rewards/accuracy_reward/std": 0.4984292685985565, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7008928656578064, + "rewards/tag_count_reward/std": 0.32311972975730896, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1240.134033203125, + "completions/mean_terminated_length": 902.6708984375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.13467582973736084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11959270741761663, + "kl": 0.01312255859375, + "learning_rate": 9.96776166400597e-07, + "loss": 0.1143, + "num_tokens": 412390121.0, + "reward": 1.08203125, + "reward_std": 0.39291685819625854, + "rewards/accuracy_reward/mean": 0.3973214328289032, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6847098469734192, + "rewards/tag_count_reward/std": 0.31870388984680176, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1227.71435546875, + "completions/mean_terminated_length": 931.0151977539062, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.13488892440466677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13621065368008214, + "kl": 0.0145721435546875, + "learning_rate": 9.967360430397418e-07, + "loss": 0.1484, + "num_tokens": 413015337.0, + "reward": 1.2527902126312256, + "reward_std": 0.4855668544769287, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7081473469734192, + "rewards/tag_count_reward/std": 0.3160606324672699, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1186.27685546875, + "completions/mean_terminated_length": 885.1927490234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.13510201907197272, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1238694995629794, + "kl": 0.0150909423828125, + "learning_rate": 9.966956724446816e-07, + "loss": 0.0906, + "num_tokens": 413615781.0, + "reward": 1.1512277126312256, + "reward_std": 0.4346998333930969, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7025669813156128, + "rewards/tag_count_reward/std": 0.3125973641872406, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1231.29248046875, + "completions/mean_terminated_length": 939.257568359375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.13531511373927868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12399338572859975, + "kl": 0.0149078369140625, + "learning_rate": 9.966550546377586e-07, + "loss": 0.0786, + "num_tokens": 414231528.0, + "reward": 1.161272406578064, + "reward_std": 0.4347052574157715, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7081473469734192, + "rewards/tag_count_reward/std": 0.3029598891735077, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1243.3616943359375, + "completions/mean_terminated_length": 925.0155639648438, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.13552820840658464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13078842583908154, + "kl": 0.013336181640625, + "learning_rate": 9.96614189641451e-07, + "loss": 0.1098, + "num_tokens": 414865738.0, + "reward": 1.1272321939468384, + "reward_std": 0.48820939660072327, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6540178656578064, + "rewards/tag_count_reward/std": 0.32938748598098755, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1139.2366943359375, + "completions/mean_terminated_length": 897.9265747070312, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.13574130307389057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6761653997003791, + "kl": 0.0206756591796875, + "learning_rate": 9.96573077478375e-07, + "loss": 0.0816, + "num_tokens": 415450980.0, + "reward": 1.165178656578064, + "reward_std": 0.3777007460594177, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.494759202003479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7410714030265808, + "rewards/tag_count_reward/std": 0.30730947852134705, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1251.091552734375, + "completions/mean_terminated_length": 833.6632690429688, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.13595439774119653, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12935028109246918, + "kl": 0.01348876953125, + "learning_rate": 9.96531718171283e-07, + "loss": 0.1035, + "num_tokens": 416084061.0, + "reward": 1.1986607313156128, + "reward_std": 0.4210280179977417, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6919642686843872, + "rewards/tag_count_reward/std": 0.3535251319408417, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1135.51123046875, + "completions/mean_terminated_length": 852.6929931640625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.13616749240850248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1288742480884207, + "kl": 0.01458740234375, + "learning_rate": 9.96490111743064e-07, + "loss": 0.1407, + "num_tokens": 416660226.0, + "reward": 1.262834906578064, + "reward_std": 0.42723309993743896, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7360491156578064, + "rewards/tag_count_reward/std": 0.29481083154678345, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1210.53125, + "completions/mean_terminated_length": 954.163330078125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.13638058707580844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12220346223242336, + "kl": 0.0132598876953125, + "learning_rate": 9.96448258216744e-07, + "loss": 0.0995, + "num_tokens": 417271024.0, + "reward": 1.2477679252624512, + "reward_std": 0.4914741814136505, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.30976149439811707, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1124.0960693359375, + "completions/mean_terminated_length": 844.7761840820312, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.13659368174311437, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11928098244949728, + "kl": 0.0154266357421875, + "learning_rate": 9.964061576154856e-07, + "loss": 0.1122, + "num_tokens": 417846443.0, + "reward": 1.3459821939468384, + "reward_std": 0.3628884255886078, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7522321343421936, + "rewards/tag_count_reward/std": 0.3137339651584625, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1043.5826416015625, + "completions/mean_terminated_length": 804.964111328125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.13680677641042033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13661580769527956, + "kl": 0.0164337158203125, + "learning_rate": 9.963638099625888e-07, + "loss": 0.1131, + "num_tokens": 418378000.0, + "reward": 1.2661831378936768, + "reward_std": 0.3823888599872589, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7728794813156128, + "rewards/tag_count_reward/std": 0.29708874225616455, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1150.5179443359375, + "completions/mean_terminated_length": 885.9421997070312, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.13701987107772629, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12445545588732315, + "kl": 0.01507568359375, + "learning_rate": 9.963212152814892e-07, + "loss": 0.0693, + "num_tokens": 418959960.0, + "reward": 1.2444196939468384, + "reward_std": 0.4136959910392761, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7645089030265808, + "rewards/tag_count_reward/std": 0.2897607386112213, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1031.5201416015625, + "completions/mean_terminated_length": 803.7841186523438, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.13723296574503224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1345315641334434, + "kl": 0.015289306640625, + "learning_rate": 9.962783735957599e-07, + "loss": 0.1082, + "num_tokens": 419494113.0, + "reward": 1.317522406578064, + "reward_std": 0.37469494342803955, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855977296829224, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7728794813156128, + "rewards/tag_count_reward/std": 0.2786311209201813, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 998.169677734375, + "completions/mean_terminated_length": 816.7853393554688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.13744606041233817, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13391975107457185, + "kl": 0.0179595947265625, + "learning_rate": 9.962352849291106e-07, + "loss": 0.124, + "num_tokens": 420007533.0, + "reward": 1.3856027126312256, + "reward_std": 0.4255538880825043, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.27235499024391174, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1168.640625, + "completions/mean_terminated_length": 879.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.13765915507964413, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12463116313955848, + "kl": 0.0140838623046875, + "learning_rate": 9.961919493053876e-07, + "loss": 0.1024, + "num_tokens": 420605900.0, + "reward": 1.157366156578064, + "reward_std": 0.34890905022621155, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7287946343421936, + "rewards/tag_count_reward/std": 0.3053349256515503, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1035.0335693359375, + "completions/mean_terminated_length": 853.7658081054688, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1378722497469501, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13722399284732453, + "kl": 0.019561767578125, + "learning_rate": 9.961483667485734e-07, + "loss": 0.0861, + "num_tokens": 421138139.0, + "reward": 1.3939732313156128, + "reward_std": 0.3611336052417755, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7912946343421936, + "rewards/tag_count_reward/std": 0.2871640622615814, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1148.462158203125, + "completions/mean_terminated_length": 937.8264770507812, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.13808534441425604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1200003729768075, + "kl": 0.014678955078125, + "learning_rate": 9.961045372827882e-07, + "loss": 0.0774, + "num_tokens": 421720426.0, + "reward": 1.3264509439468384, + "reward_std": 0.4491410553455353, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7639508843421936, + "rewards/tag_count_reward/std": 0.3009132444858551, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1108.8035888671875, + "completions/mean_terminated_length": 872.6926879882812, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.13829843908156197, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1199079648018838, + "kl": 0.016571044921875, + "learning_rate": 9.96060460932288e-07, + "loss": 0.1102, + "num_tokens": 422282018.0, + "reward": 1.3219866752624512, + "reward_std": 0.4110329747200012, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7728794813156128, + "rewards/tag_count_reward/std": 0.28508007526397705, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1128.1317138671875, + "completions/mean_terminated_length": 887.152099609375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.13851153374886793, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12007830407921395, + "kl": 0.0161590576171875, + "learning_rate": 9.960161377214657e-07, + "loss": 0.092, + "num_tokens": 422853725.0, + "reward": 1.1077009439468384, + "reward_std": 0.313351035118103, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7170758843421936, + "rewards/tag_count_reward/std": 0.29755479097366333, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1018.72998046875, + "completions/mean_terminated_length": 828.1243286132812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1387246284161739, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13035382031757464, + "kl": 0.01568603515625, + "learning_rate": 9.959715676748508e-07, + "loss": 0.0787, + "num_tokens": 423376068.0, + "reward": 1.3069196939468384, + "reward_std": 0.44363412261009216, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7801339030265808, + "rewards/tag_count_reward/std": 0.28069183230400085, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1123.0982666015625, + "completions/mean_terminated_length": 822.0946655273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.13893772308347985, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13919123000043887, + "kl": 0.01605224609375, + "learning_rate": 9.959267508171093e-07, + "loss": 0.1221, + "num_tokens": 423943728.0, + "reward": 1.1623884439468384, + "reward_std": 0.3710832893848419, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7338169813156128, + "rewards/tag_count_reward/std": 0.29799917340278625, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1267.837158203125, + "completions/mean_terminated_length": 875.1375732421875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.13915081775078578, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.636419193272952, + "kl": 0.077545166015625, + "learning_rate": 9.958816871730442e-07, + "loss": 0.1224, + "num_tokens": 424587191.0, + "reward": 1.087053656578064, + "reward_std": 0.4358670115470886, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6986607313156128, + "rewards/tag_count_reward/std": 0.336348295211792, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1133.1741943359375, + "completions/mean_terminated_length": 842.5823974609375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.13936391241809173, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1244959609911837, + "kl": 0.0164794921875, + "learning_rate": 9.958363767675943e-07, + "loss": 0.0849, + "num_tokens": 425167685.0, + "reward": 1.2583706378936768, + "reward_std": 0.3871222138404846, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7472098469734192, + "rewards/tag_count_reward/std": 0.294179230928421, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1158.602783203125, + "completions/mean_terminated_length": 844.2235717773438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1395770070853977, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12294051632959824, + "kl": 0.0159149169921875, + "learning_rate": 9.95790819625836e-07, + "loss": 0.0517, + "num_tokens": 425755043.0, + "reward": 1.231584906578064, + "reward_std": 0.41363105177879333, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7405133843421936, + "rewards/tag_count_reward/std": 0.3102361857891083, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1186.7857666015625, + "completions/mean_terminated_length": 970.279296875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.13979010175270365, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.135112353243703, + "kl": 0.019775390625, + "learning_rate": 9.957450157729813e-07, + "loss": 0.1379, + "num_tokens": 426365235.0, + "reward": 1.2209821939468384, + "reward_std": 0.42713111639022827, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7790178656578064, + "rewards/tag_count_reward/std": 0.2963150143623352, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1196.7857666015625, + "completions/mean_terminated_length": 939.4418334960938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.14000319642000958, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13049455926595557, + "kl": 0.017578125, + "learning_rate": 9.95698965234379e-07, + "loss": 0.0832, + "num_tokens": 426967011.0, + "reward": 1.2806919813156128, + "reward_std": 0.4483509957790375, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7561383843421936, + "rewards/tag_count_reward/std": 0.3025640547275543, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1149.5067138671875, + "completions/mean_terminated_length": 877.8692016601562, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.14021629108731554, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12853634277442424, + "kl": 0.019256591796875, + "learning_rate": 9.956526680355151e-07, + "loss": 0.1195, + "num_tokens": 427556262.0, + "reward": 1.1969866752624512, + "reward_std": 0.42253217101097107, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7438616156578064, + "rewards/tag_count_reward/std": 0.3066949248313904, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1021.05810546875, + "completions/mean_terminated_length": 846.7728881835938, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1404293857546215, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13510570519071793, + "kl": 0.0189208984375, + "learning_rate": 9.956061242020112e-07, + "loss": 0.0607, + "num_tokens": 428078976.0, + "reward": 1.4062501192092896, + "reward_std": 0.3642091155052185, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.28867512941360474, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1139.888427734375, + "completions/mean_terminated_length": 917.9055786132812, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.14064248042192745, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13262987804753062, + "kl": 0.01873779296875, + "learning_rate": 9.955593337596257e-07, + "loss": 0.0913, + "num_tokens": 428659838.0, + "reward": 1.3716518878936768, + "reward_std": 0.4764446020126343, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.2984538972377777, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1211.419677734375, + "completions/mean_terminated_length": 935.8694458007812, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.14085557508923338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38541013683293884, + "kl": 0.020721435546875, + "learning_rate": 9.955122967342536e-07, + "loss": 0.0893, + "num_tokens": 429274122.0, + "reward": 1.203125, + "reward_std": 0.4242902398109436, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7455357313156128, + "rewards/tag_count_reward/std": 0.3105745017528534, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1206.0335693359375, + "completions/mean_terminated_length": 922.02685546875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.14106866975653934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11643248789949381, + "kl": 0.01495361328125, + "learning_rate": 9.954650131519264e-07, + "loss": 0.0502, + "num_tokens": 429880953.0, + "reward": 1.1629464626312256, + "reward_std": 0.3195909559726715, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7366071343421936, + "rewards/tag_count_reward/std": 0.2955472469329834, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1169.2567138671875, + "completions/mean_terminated_length": 872.8447875976562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.1412817644238453, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12806787224321653, + "kl": 0.0146484375, + "learning_rate": 9.95417483038812e-07, + "loss": 0.1192, + "num_tokens": 430477500.0, + "reward": 1.176897406578064, + "reward_std": 0.45724910497665405, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.7327008843421936, + "rewards/tag_count_reward/std": 0.325735867023468, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1124.196533203125, + "completions/mean_terminated_length": 868.9002685546875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.14149485909115125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1188481689409286, + "kl": 0.0155792236328125, + "learning_rate": 9.953697064212145e-07, + "loss": 0.0819, + "num_tokens": 431051140.0, + "reward": 1.3063616752624512, + "reward_std": 0.35998496413230896, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7572544813156128, + "rewards/tag_count_reward/std": 0.30253928899765015, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1222.5513916015625, + "completions/mean_terminated_length": 877.7437133789062, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.14170795375845718, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13900708151281793, + "kl": 0.0157470703125, + "learning_rate": 9.953216833255745e-07, + "loss": 0.0878, + "num_tokens": 431665499.0, + "reward": 1.1155134439468384, + "reward_std": 0.4503583014011383, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7092633843421936, + "rewards/tag_count_reward/std": 0.3330056071281433, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1201.6004638671875, + "completions/mean_terminated_length": 922.8160400390625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.14192104842576314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14679938431054437, + "kl": 0.020843505859375, + "learning_rate": 9.952734137784693e-07, + "loss": 0.0901, + "num_tokens": 432276552.0, + "reward": 1.172991156578064, + "reward_std": 0.36652687191963196, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7444196343421936, + "rewards/tag_count_reward/std": 0.3101058900356293, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1067.90625, + "completions/mean_terminated_length": 824.9303588867188, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.1421341430930691, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13679206189772566, + "kl": 0.0166015625, + "learning_rate": 9.952248978066123e-07, + "loss": 0.1453, + "num_tokens": 432824030.0, + "reward": 1.21484375, + "reward_std": 0.40582770109176636, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.28955546021461487, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1195.274658203125, + "completions/mean_terminated_length": 904.224609375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.14234723776037506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13111975263057254, + "kl": 0.0186767578125, + "learning_rate": 9.951761354368534e-07, + "loss": 0.0692, + "num_tokens": 433423625.0, + "reward": 1.3537946939468384, + "reward_std": 0.4087453782558441, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7645089030265808, + "rewards/tag_count_reward/std": 0.33413445949554443, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1120.404052734375, + "completions/mean_terminated_length": 936.8690185546875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.14256033242768099, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12232292897697075, + "kl": 0.0156402587890625, + "learning_rate": 9.95127126696179e-07, + "loss": 0.1035, + "num_tokens": 433996782.0, + "reward": 1.274553656578064, + "reward_std": 0.42003926634788513, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7633928656578064, + "rewards/tag_count_reward/std": 0.27849724888801575, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1010.3750610351562, + "completions/mean_terminated_length": 760.3102416992188, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.14277342709498694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1395553786982362, + "kl": 0.017791748046875, + "learning_rate": 9.950778716117116e-07, + "loss": 0.1133, + "num_tokens": 434517526.0, + "reward": 1.26953125, + "reward_std": 0.35728737711906433, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7806919813156128, + "rewards/tag_count_reward/std": 0.2944888472557068, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1047.2879638671875, + "completions/mean_terminated_length": 816.3544311523438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1429865217622929, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1196272266497482, + "kl": 0.01806640625, + "learning_rate": 9.950283702107098e-07, + "loss": 0.047, + "num_tokens": 435059271.0, + "reward": 1.3928571939468384, + "reward_std": 0.35737544298171997, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7946428656578064, + "rewards/tag_count_reward/std": 0.28124499320983887, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1198.7523193359375, + "completions/mean_terminated_length": 932.272705078125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.14319961642959886, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1260055842613297, + "kl": 0.016998291015625, + "learning_rate": 9.949786225205693e-07, + "loss": 0.0711, + "num_tokens": 435663080.0, + "reward": 1.2321429252624512, + "reward_std": 0.4431600868701935, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7566964030265808, + "rewards/tag_count_reward/std": 0.3118821680545807, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1073.26123046875, + "completions/mean_terminated_length": 867.7756958007812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.1434127110969048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13601337713735054, + "kl": 0.018524169921875, + "learning_rate": 9.949286285688215e-07, + "loss": 0.0904, + "num_tokens": 436212333.0, + "reward": 1.3789063692092896, + "reward_std": 0.3620477616786957, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7940848469734192, + "rewards/tag_count_reward/std": 0.27198344469070435, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1072.62060546875, + "completions/mean_terminated_length": 854.0928955078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.14362580576421075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.136176703848127, + "kl": 0.0145111083984375, + "learning_rate": 9.94878388383134e-07, + "loss": 0.1493, + "num_tokens": 436758611.0, + "reward": 1.20703125, + "reward_std": 0.4154183566570282, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7494419813156128, + "rewards/tag_count_reward/std": 0.29702988266944885, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1164.8170166015625, + "completions/mean_terminated_length": 914.2865600585938, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.1438389004315167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13341127566071212, + "kl": 0.013092041015625, + "learning_rate": 9.948279019913111e-07, + "loss": 0.1066, + "num_tokens": 437354481.0, + "reward": 1.1618304252624512, + "reward_std": 0.3986019492149353, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7399553656578064, + "rewards/tag_count_reward/std": 0.2994394302368164, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1141.779052734375, + "completions/mean_terminated_length": 817.736328125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.14405199509882266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8908448030934071, + "kl": 0.023681640625, + "learning_rate": 9.947771694212933e-07, + "loss": 0.0828, + "num_tokens": 437940734.0, + "reward": 1.1194196939468384, + "reward_std": 0.44295674562454224, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6662946343421936, + "rewards/tag_count_reward/std": 0.32764512300491333, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1057.8148193359375, + "completions/mean_terminated_length": 832.6493530273438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1442650897661286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1345079693676448, + "kl": 0.015411376953125, + "learning_rate": 9.947261907011568e-07, + "loss": 0.1195, + "num_tokens": 438485115.0, + "reward": 1.2154018878936768, + "reward_std": 0.4091176390647888, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7555803656578064, + "rewards/tag_count_reward/std": 0.3114555776119232, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1218.810302734375, + "completions/mean_terminated_length": 961.8099365234375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.14447818443343455, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11610939918420013, + "kl": 0.013336181640625, + "learning_rate": 9.946749658591147e-07, + "loss": 0.1242, + "num_tokens": 439104230.0, + "reward": 1.1869419813156128, + "reward_std": 0.4558294415473938, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7472098469734192, + "rewards/tag_count_reward/std": 0.3085617125034332, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1103.5357666015625, + "completions/mean_terminated_length": 925.665771484375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1446912791007405, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24679484703237348, + "kl": 0.017578125, + "learning_rate": 9.946234949235159e-07, + "loss": 0.1552, + "num_tokens": 439673126.0, + "reward": 1.2388393878936768, + "reward_std": 0.4114640951156616, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7522321343421936, + "rewards/tag_count_reward/std": 0.29251575469970703, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1076.08935546875, + "completions/mean_terminated_length": 858.3387451171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.14490437376804646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12698216031062878, + "kl": 0.0137939453125, + "learning_rate": 9.945717779228458e-07, + "loss": 0.1167, + "num_tokens": 440224462.0, + "reward": 1.1127232313156128, + "reward_std": 0.41351184248924255, + "rewards/accuracy_reward/mean": 0.3660714328289032, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7466517686843872, + "rewards/tag_count_reward/std": 0.29771679639816284, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1216.7701416015625, + "completions/mean_terminated_length": 873.2650146484375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.1451174684353524, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12372034551200503, + "kl": 0.0140533447265625, + "learning_rate": 9.945198148857257e-07, + "loss": 0.1462, + "num_tokens": 440839079.0, + "reward": 1.2103794813156128, + "reward_std": 0.46855056285858154, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7237723469734192, + "rewards/tag_count_reward/std": 0.31686145067214966, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1153.51123046875, + "completions/mean_terminated_length": 858.88720703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.14533056310265835, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12119071938007452, + "kl": 0.0131683349609375, + "learning_rate": 9.944676058409132e-07, + "loss": 0.1452, + "num_tokens": 441432588.0, + "reward": 1.1032366752624512, + "reward_std": 0.4475897252559662, + "rewards/accuracy_reward/mean": 0.3973214328289032, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7059151530265808, + "rewards/tag_count_reward/std": 0.31308820843696594, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1022.5045166015625, + "completions/mean_terminated_length": 771.8278198242188, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1455436577699643, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.134775360114139, + "kl": 0.016876220703125, + "learning_rate": 9.944151508173017e-07, + "loss": 0.1258, + "num_tokens": 441957806.0, + "reward": 1.23046875, + "reward_std": 0.3669811189174652, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7349330186843872, + "rewards/tag_count_reward/std": 0.31978318095207214, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1216.10498046875, + "completions/mean_terminated_length": 883.3468627929688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.14575675243727026, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11850368954786407, + "kl": 0.013824462890625, + "learning_rate": 9.943624498439214e-07, + "loss": 0.0948, + "num_tokens": 442571677.0, + "reward": 1.1774554252624512, + "reward_std": 0.46347200870513916, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7511160969734192, + "rewards/tag_count_reward/std": 0.29584914445877075, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1017.0714721679688, + "completions/mean_terminated_length": 854.5736694335938, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1459698471045762, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13327447595877517, + "kl": 0.0148773193359375, + "learning_rate": 9.943095029499382e-07, + "loss": 0.0982, + "num_tokens": 443098621.0, + "reward": 1.2181919813156128, + "reward_std": 0.3946465849876404, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.2607157230377197, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1103.5223388671875, + "completions/mean_terminated_length": 859.44384765625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.14618294177188215, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13797045192875015, + "kl": 0.0156402587890625, + "learning_rate": 9.942563101646544e-07, + "loss": 0.1473, + "num_tokens": 443665735.0, + "reward": 1.2371652126312256, + "reward_std": 0.4235488176345825, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7527901530265808, + "rewards/tag_count_reward/std": 0.29795730113983154, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1197.94873046875, + "completions/mean_terminated_length": 861.635498046875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1463960364391881, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13630735009608785, + "kl": 0.015106201171875, + "learning_rate": 9.942028715175076e-07, + "loss": 0.1282, + "num_tokens": 444270976.0, + "reward": 1.1841518878936768, + "reward_std": 0.45379647612571716, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.33310166001319885, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 989.3817138671875, + "completions/mean_terminated_length": 786.6675415039062, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.14660913110649407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1335243266387221, + "kl": 0.015411376953125, + "learning_rate": 9.94149187038072e-07, + "loss": 0.0808, + "num_tokens": 444781211.0, + "reward": 1.2371652126312256, + "reward_std": 0.4472712278366089, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7840401530265808, + "rewards/tag_count_reward/std": 0.2677413523197174, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1032.8304443359375, + "completions/mean_terminated_length": 835.2106323242188, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.1468222257738, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1317542465611623, + "kl": 0.0149383544921875, + "learning_rate": 9.940952567560585e-07, + "loss": 0.0915, + "num_tokens": 445320399.0, + "reward": 1.2209821939468384, + "reward_std": 0.37385305762290955, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7901785969734192, + "rewards/tag_count_reward/std": 0.28926268219947815, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 925.7701416015625, + "completions/mean_terminated_length": 787.952392578125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.14703532044110595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13703578094580346, + "kl": 0.01751708984375, + "learning_rate": 9.940410807013129e-07, + "loss": 0.0401, + "num_tokens": 445805864.0, + "reward": 1.2974331378936768, + "reward_std": 0.33549997210502625, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8041294813156128, + "rewards/tag_count_reward/std": 0.26174795627593994, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 903.7656860351562, + "completions/mean_terminated_length": 719.9766845703125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1472484151084119, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1437043969359983, + "kl": 0.017486572265625, + "learning_rate": 9.939866589038172e-07, + "loss": 0.0918, + "num_tokens": 446277695.0, + "reward": 1.3286831378936768, + "reward_std": 0.34816738963127136, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7974330186843872, + "rewards/tag_count_reward/std": 0.2864561975002289, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1056.76123046875, + "completions/mean_terminated_length": 817.8753051757812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.14746150977571787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13240029643150733, + "kl": 0.015625, + "learning_rate": 9.939319913936906e-07, + "loss": 0.0807, + "num_tokens": 446823428.0, + "reward": 1.234375, + "reward_std": 0.44145601987838745, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7700892686843872, + "rewards/tag_count_reward/std": 0.3003323972225189, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 892.4129638671875, + "completions/mean_terminated_length": 747.2387084960938, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.14767460444302383, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1363330965749457, + "kl": 0.019622802734375, + "learning_rate": 9.938770782011864e-07, + "loss": 0.0399, + "num_tokens": 447284013.0, + "reward": 1.3727679252624512, + "reward_std": 0.3388535678386688, + "rewards/accuracy_reward/mean": 0.5555555820465088, + "rewards/accuracy_reward/std": 0.4974800646305084, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8370535969734192, + "rewards/tag_count_reward/std": 0.25296854972839355, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 929.7723388671875, + "completions/mean_terminated_length": 779.7316284179688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.14788769911032976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13918389062211142, + "kl": 0.017791748046875, + "learning_rate": 9.938219193566956e-07, + "loss": 0.0554, + "num_tokens": 447767015.0, + "reward": 1.4681919813156128, + "reward_std": 0.37598365545272827, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8297991156578064, + "rewards/tag_count_reward/std": 0.26157617568969727, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 994.7366333007812, + "completions/mean_terminated_length": 838.0974731445312, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.14810079377763571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13991430281370598, + "kl": 0.0179901123046875, + "learning_rate": 9.93766514890744e-07, + "loss": 0.0429, + "num_tokens": 448286225.0, + "reward": 1.2840402126312256, + "reward_std": 0.366243451833725, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7974330186843872, + "rewards/tag_count_reward/std": 0.2603755295276642, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1097.388427734375, + "completions/mean_terminated_length": 896.9891967773438, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.14831388844494167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11412828181744235, + "kl": 0.0153961181640625, + "learning_rate": 9.937108648339939e-07, + "loss": 0.0436, + "num_tokens": 448841695.0, + "reward": 1.2678571939468384, + "reward_std": 0.45982861518859863, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8102678656578064, + "rewards/tag_count_reward/std": 0.2722134292125702, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1125.7523193359375, + "completions/mean_terminated_length": 864.1404418945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.14852698311224763, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1345371092731451, + "kl": 0.0174713134765625, + "learning_rate": 9.936549692172433e-07, + "loss": 0.1495, + "num_tokens": 449411200.0, + "reward": 1.2645089626312256, + "reward_std": 0.4366303086280823, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.30836185812950134, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 950.1920166015625, + "completions/mean_terminated_length": 777.1524658203125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.14874007777955356, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1284.670829674342, + "kl": 29.88507080078125, + "learning_rate": 9.93598828071426e-07, + "loss": 1.2511, + "num_tokens": 449910246.0, + "reward": 1.4023438692092896, + "reward_std": 0.33358901739120483, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8286830186843872, + "rewards/tag_count_reward/std": 0.2744280993938446, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1084.5179443359375, + "completions/mean_terminated_length": 807.6551513671875, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.14895317244685952, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12905867254968686, + "kl": 0.01666259765625, + "learning_rate": 9.93542441427612e-07, + "loss": 0.1086, + "num_tokens": 450466494.0, + "reward": 1.1863839626312256, + "reward_std": 0.3764786124229431, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7712053656578064, + "rewards/tag_count_reward/std": 0.2969778776168823, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1123.6004638671875, + "completions/mean_terminated_length": 878.138427734375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.14916626711416547, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13386271170882727, + "kl": 0.017059326171875, + "learning_rate": 9.934858093170071e-07, + "loss": 0.1489, + "num_tokens": 451039067.0, + "reward": 1.4129464626312256, + "reward_std": 0.4689089059829712, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7857142686843872, + "rewards/tag_count_reward/std": 0.3161519467830658, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1037.99560546875, + "completions/mean_terminated_length": 866.5848999023438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.14937936178147143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11442320032859457, + "kl": 0.01531982421875, + "learning_rate": 9.934289317709526e-07, + "loss": 0.0855, + "num_tokens": 451575849.0, + "reward": 1.4648438692092896, + "reward_std": 0.3985152542591095, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.485796183347702, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8443080186843872, + "rewards/tag_count_reward/std": 0.27356699109077454, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 999.7277221679688, + "completions/mean_terminated_length": 771.8424072265625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.14959245644877736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6945461007664862, + "kl": 0.0181427001953125, + "learning_rate": 9.93371808820926e-07, + "loss": 0.0832, + "num_tokens": 452092255.0, + "reward": 1.4017857313156128, + "reward_std": 0.4318530559539795, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8191964030265808, + "rewards/tag_count_reward/std": 0.29340213537216187, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 990.8951416015625, + "completions/mean_terminated_length": 833.6846313476562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.14980555111608332, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11993278057004805, + "kl": 0.01715087890625, + "learning_rate": 9.933144404985405e-07, + "loss": 0.0949, + "num_tokens": 452604560.0, + "reward": 1.3828126192092896, + "reward_std": 0.35533708333969116, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8113839030265808, + "rewards/tag_count_reward/std": 0.2750307619571686, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1062.774658203125, + "completions/mean_terminated_length": 828.7155151367188, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.15001864578338928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14157479382569624, + "kl": 0.0167083740234375, + "learning_rate": 9.932568268355448e-07, + "loss": 0.1253, + "num_tokens": 453158187.0, + "reward": 1.1501116752624512, + "reward_std": 0.4373627007007599, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7639508843421936, + "rewards/tag_count_reward/std": 0.3123137056827545, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 959.9732666015625, + "completions/mean_terminated_length": 778.6354370117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15023174045069523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13468308929565764, + "kl": 0.0194091796875, + "learning_rate": 9.93198967863824e-07, + "loss": 0.1071, + "num_tokens": 453655455.0, + "reward": 1.3465402126312256, + "reward_std": 0.411983847618103, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8018973469734192, + "rewards/tag_count_reward/std": 0.2792443037033081, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1068.390625, + "completions/mean_terminated_length": 848.915283203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15044483511800116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14071724135958796, + "kl": 0.0152587890625, + "learning_rate": 9.931408636153984e-07, + "loss": 0.1387, + "num_tokens": 454209166.0, + "reward": 1.203125, + "reward_std": 0.41122350096702576, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7723214030265808, + "rewards/tag_count_reward/std": 0.3107031285762787, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1065.734375, + "completions/mean_terminated_length": 787.0974731445312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.15065792978530712, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14508417827500644, + "kl": 0.01678466796875, + "learning_rate": 9.930825141224242e-07, + "loss": 0.0973, + "num_tokens": 454750295.0, + "reward": 1.29296875, + "reward_std": 0.32834291458129883, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7728794813156128, + "rewards/tag_count_reward/std": 0.3022213876247406, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1098.12060546875, + "completions/mean_terminated_length": 865.9277954101562, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.15087102445261308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13611513176707363, + "kl": 0.016754150390625, + "learning_rate": 9.930239194171937e-07, + "loss": 0.1119, + "num_tokens": 455309853.0, + "reward": 1.2868304252624512, + "reward_std": 0.4366273283958435, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7935267686843872, + "rewards/tag_count_reward/std": 0.27539366483688354, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1066.9107666015625, + "completions/mean_terminated_length": 816.8291625976562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.15108411911991904, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14360486659779728, + "kl": 0.0167999267578125, + "learning_rate": 9.929650795321344e-07, + "loss": 0.1054, + "num_tokens": 455862181.0, + "reward": 1.3577009439468384, + "reward_std": 0.3859207034111023, + "rewards/accuracy_reward/mean": 0.5671296119689941, + "rewards/accuracy_reward/std": 0.49604758620262146, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8108258843421936, + "rewards/tag_count_reward/std": 0.2863602936267853, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 966.10498046875, + "completions/mean_terminated_length": 795.5736694335938, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.15129721378722497, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14821180553149105, + "kl": 0.02020263671875, + "learning_rate": 9.929059944998096e-07, + "loss": 0.0871, + "num_tokens": 456360868.0, + "reward": 1.4977679252624512, + "reward_std": 0.38166478276252747, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8504464030265808, + "rewards/tag_count_reward/std": 0.25897711515426636, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1044.3616943359375, + "completions/mean_terminated_length": 848.9866333007812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.15151030845453092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15104417443856227, + "kl": 0.017364501953125, + "learning_rate": 9.928466643529185e-07, + "loss": 0.1025, + "num_tokens": 456898198.0, + "reward": 1.3515626192092896, + "reward_std": 0.43090930581092834, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8180803656578064, + "rewards/tag_count_reward/std": 0.28051385283470154, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1136.40625, + "completions/mean_terminated_length": 922.9476928710938, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.15172340312183688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12324357361636927, + "kl": 0.0167083740234375, + "learning_rate": 9.927870891242956e-07, + "loss": 0.0962, + "num_tokens": 457475164.0, + "reward": 1.3521206378936768, + "reward_std": 0.37399131059646606, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7963169813156128, + "rewards/tag_count_reward/std": 0.28270989656448364, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1073.0670166015625, + "completions/mean_terminated_length": 834.75, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.15193649778914284, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13059621304588653, + "kl": 0.0170135498046875, + "learning_rate": 9.927272688469115e-07, + "loss": 0.0954, + "num_tokens": 458027978.0, + "reward": 1.301897406578064, + "reward_std": 0.3749329745769501, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7818080186843872, + "rewards/tag_count_reward/std": 0.32030200958251953, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1022.8995971679688, + "completions/mean_terminated_length": 867.4215698242188, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.15214959245644877, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13191018403219468, + "kl": 0.018768310546875, + "learning_rate": 9.926672035538716e-07, + "loss": 0.1032, + "num_tokens": 458550429.0, + "reward": 1.3470982313156128, + "reward_std": 0.3803008794784546, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8158482313156128, + "rewards/tag_count_reward/std": 0.2667539417743683, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1229.44873046875, + "completions/mean_terminated_length": 923.11962890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15236268712375473, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12096691964925119, + "kl": 0.01397705078125, + "learning_rate": 9.926068932784182e-07, + "loss": 0.1473, + "num_tokens": 459181046.0, + "reward": 1.1891741752624512, + "reward_std": 0.4436335861682892, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7606026530265808, + "rewards/tag_count_reward/std": 0.3124455511569977, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1063.6942138671875, + "completions/mean_terminated_length": 865.7775268554688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.15257578179106068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15111345566286427, + "kl": 0.01824951171875, + "learning_rate": 9.92546338053928e-07, + "loss": 0.1155, + "num_tokens": 459722157.0, + "reward": 1.328125, + "reward_std": 0.4215332567691803, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.28088077902793884, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1286.83935546875, + "completions/mean_terminated_length": 995.5308837890625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.15278887645836664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12152851655631211, + "kl": 0.01513671875, + "learning_rate": 9.924855379139136e-07, + "loss": 0.0678, + "num_tokens": 460373573.0, + "reward": 1.1037946939468384, + "reward_std": 0.4427836537361145, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399619221687317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7645089030265808, + "rewards/tag_count_reward/std": 0.3187141716480255, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1154.8973388671875, + "completions/mean_terminated_length": 917.7457885742188, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.15300197112567257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12879353434402135, + "kl": 0.018646240234375, + "learning_rate": 9.924244928920232e-07, + "loss": 0.0832, + "num_tokens": 460959703.0, + "reward": 1.2075893878936768, + "reward_std": 0.3688276708126068, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7611607313156128, + "rewards/tag_count_reward/std": 0.29939982295036316, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1234.384033203125, + "completions/mean_terminated_length": 969.59765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15321506579297853, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11247017455206236, + "kl": 0.0162506103515625, + "learning_rate": 9.923632030220411e-07, + "loss": 0.1086, + "num_tokens": 461588083.0, + "reward": 1.2399554252624512, + "reward_std": 0.42860835790634155, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7555803656578064, + "rewards/tag_count_reward/std": 0.31811821460723877, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1195.759033203125, + "completions/mean_terminated_length": 925.0470581054688, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.15342816046028449, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13108526378125848, + "kl": 0.0157318115234375, + "learning_rate": 9.923016683378858e-07, + "loss": 0.1299, + "num_tokens": 462193287.0, + "reward": 1.3013393878936768, + "reward_std": 0.43348729610443115, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7834821343421936, + "rewards/tag_count_reward/std": 0.3137339651584625, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1172.5, + "completions/mean_terminated_length": 911.118896484375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.15364125512759044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13039838088176925, + "kl": 0.0148162841796875, + "learning_rate": 9.922398888736125e-07, + "loss": 0.1138, + "num_tokens": 462789559.0, + "reward": 1.2985491752624512, + "reward_std": 0.4117524325847626, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8030133843421936, + "rewards/tag_count_reward/std": 0.2974666655063629, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1093.1116943359375, + "completions/mean_terminated_length": 885.5271606445312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.15385434979489637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12760346976845133, + "kl": 0.018646240234375, + "learning_rate": 9.921778646634114e-07, + "loss": 0.0534, + "num_tokens": 463350105.0, + "reward": 1.3978794813156128, + "reward_std": 0.37394532561302185, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8309151530265808, + "rewards/tag_count_reward/std": 0.27224037051200867, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1096.8817138671875, + "completions/mean_terminated_length": 880.6000366210938, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.15406744446220233, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13402977516786121, + "kl": 0.018707275390625, + "learning_rate": 9.921155957416078e-07, + "loss": 0.1012, + "num_tokens": 463904276.0, + "reward": 1.4380581378936768, + "reward_std": 0.4119695723056793, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.28169241547584534, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1124.169677734375, + "completions/mean_terminated_length": 865.4971313476562, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1542805391295083, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19897404578965877, + "kl": 0.01806640625, + "learning_rate": 9.92053082142663e-07, + "loss": 0.1208, + "num_tokens": 464478272.0, + "reward": 1.2561384439468384, + "reward_std": 0.4223388433456421, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7940848469734192, + "rewards/tag_count_reward/std": 0.30949485301971436, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1089.118408203125, + "completions/mean_terminated_length": 861.3176879882812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.15449363379681424, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13392246930947885, + "kl": 0.016998291015625, + "learning_rate": 9.919903239011737e-07, + "loss": 0.102, + "num_tokens": 465028165.0, + "reward": 1.3314732313156128, + "reward_std": 0.35138410329818726, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8158482313156128, + "rewards/tag_count_reward/std": 0.27039816975593567, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1188.8616943359375, + "completions/mean_terminated_length": 922.5789794921875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.15470672846412017, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12228055305980295, + "kl": 0.0157623291015625, + "learning_rate": 9.919273210518715e-07, + "loss": 0.0876, + "num_tokens": 465624967.0, + "reward": 1.3666294813156128, + "reward_std": 0.4116423726081848, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7907366156578064, + "rewards/tag_count_reward/std": 0.32146915793418884, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1089.337158203125, + "completions/mean_terminated_length": 844.9719848632812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15491982313142613, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14355790277747807, + "kl": 0.01849365234375, + "learning_rate": 9.91864073629624e-07, + "loss": 0.1357, + "num_tokens": 466177694.0, + "reward": 1.4051339626312256, + "reward_std": 0.43526557087898254, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8113839030265808, + "rewards/tag_count_reward/std": 0.3090250790119171, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 957.9710083007812, + "completions/mean_terminated_length": 830.2119750976562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1551329177987321, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13402672741771077, + "kl": 0.021240234375, + "learning_rate": 9.918005816694333e-07, + "loss": 0.0655, + "num_tokens": 466678049.0, + "reward": 1.3850446939468384, + "reward_std": 0.3447449803352356, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8448660969734192, + "rewards/tag_count_reward/std": 0.24845468997955322, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1354.540283203125, + "completions/mean_terminated_length": 1022.6865234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15534601246603805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4201026315881484, + "kl": 0.0129852294921875, + "learning_rate": 9.917368452064377e-07, + "loss": 0.0896, + "num_tokens": 467367571.0, + "reward": 1.114397406578064, + "reward_std": 0.49074214696884155, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7572544813156128, + "rewards/tag_count_reward/std": 0.33832037448883057, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1094.9866943359375, + "completions/mean_terminated_length": 803.2478637695312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15555910713334398, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12633936669659696, + "kl": 0.0172576904296875, + "learning_rate": 9.916728642759102e-07, + "loss": 0.1436, + "num_tokens": 467925741.0, + "reward": 1.2801339626312256, + "reward_std": 0.3484698235988617, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8180803656578064, + "rewards/tag_count_reward/std": 0.29031166434288025, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 872.591552734375, + "completions/mean_terminated_length": 757.3554077148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15577220180064993, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14002578637452331, + "kl": 0.021575927734375, + "learning_rate": 9.916086389132597e-07, + "loss": 0.067, + "num_tokens": 468390854.0, + "reward": 1.6194196939468384, + "reward_std": 0.3139893114566803, + "rewards/accuracy_reward/mean": 0.734375, + "rewards/accuracy_reward/std": 0.44215917587280273, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.22906675934791565, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1100.2679443359375, + "completions/mean_terminated_length": 862.0111694335938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1559852964679559, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11748074405754115, + "kl": 0.01751708984375, + "learning_rate": 9.915441691540297e-07, + "loss": 0.0879, + "num_tokens": 468953918.0, + "reward": 1.3476563692092896, + "reward_std": 0.33276113867759705, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.31494081020355225, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 887.82373046875, + "completions/mean_terminated_length": 725.4580078125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.15619839113526185, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15928226123459177, + "kl": 0.022308349609375, + "learning_rate": 9.914794550338994e-07, + "loss": 0.071, + "num_tokens": 469421503.0, + "reward": 1.551897406578064, + "reward_std": 0.30738532543182373, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166749835014343, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8844866156578064, + "rewards/tag_count_reward/std": 0.24897858500480652, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1078.8013916015625, + "completions/mean_terminated_length": 838.5264282226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.15641148580256778, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12418531600802606, + "kl": 0.0156097412109375, + "learning_rate": 9.914144965886833e-07, + "loss": 0.084, + "num_tokens": 469975558.0, + "reward": 1.1891741752624512, + "reward_std": 0.31078848242759705, + "rewards/accuracy_reward/mean": 0.3638392984867096, + "rewards/accuracy_reward/std": 0.48164102435112, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8253348469734192, + "rewards/tag_count_reward/std": 0.28337591886520386, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1003.5826416015625, + "completions/mean_terminated_length": 832.6779174804688, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.15662458046987374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14321156310072328, + "kl": 0.02032470703125, + "learning_rate": 9.913492938543305e-07, + "loss": 0.0912, + "num_tokens": 470494859.0, + "reward": 1.4285714626312256, + "reward_std": 0.4269440770149231, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8660714030265808, + "rewards/tag_count_reward/std": 0.2528994381427765, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1023.3750610351562, + "completions/mean_terminated_length": 830.408447265625, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1568376751371797, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14141820733342655, + "kl": 0.0167236328125, + "learning_rate": 9.91283846866926e-07, + "loss": 0.1505, + "num_tokens": 471021283.0, + "reward": 1.3454241752624512, + "reward_std": 0.4020189642906189, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8543526530265808, + "rewards/tag_count_reward/std": 0.25877881050109863, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 1037.341552734375, + "completions/mean_terminated_length": 800.6859741210938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15705076980448565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12850274220007044, + "kl": 0.0179443359375, + "learning_rate": 9.912181556626896e-07, + "loss": 0.0734, + "num_tokens": 471560428.0, + "reward": 1.4743304252624512, + "reward_std": 0.4303416311740875, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.25071555376052856, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1114.390625, + "completions/mean_terminated_length": 814.2035522460938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.15726386447179158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13384207113838262, + "kl": 0.0174560546875, + "learning_rate": 9.911522202779766e-07, + "loss": 0.1161, + "num_tokens": 472125755.0, + "reward": 1.2723214626312256, + "reward_std": 0.3943602740764618, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7946428656578064, + "rewards/tag_count_reward/std": 0.30000796914100647, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1147.52685546875, + "completions/mean_terminated_length": 882.0693359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15747695913909754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12572316341902748, + "kl": 0.015289306640625, + "learning_rate": 9.910860407492768e-07, + "loss": 0.1114, + "num_tokens": 472713415.0, + "reward": 1.2712054252624512, + "reward_std": 0.39238712191581726, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.30665677785873413, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 950.2388916015625, + "completions/mean_terminated_length": 743.4986572265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1576900538064035, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11960700352863632, + "kl": 0.01824951171875, + "learning_rate": 9.910196171132157e-07, + "loss": 0.1202, + "num_tokens": 473203282.0, + "reward": 1.4185268878936768, + "reward_std": 0.4130321443080902, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8649553656578064, + "rewards/tag_count_reward/std": 0.2517491281032562, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1051.40625, + "completions/mean_terminated_length": 838.0433959960938, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.15790314847370945, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13473806243376207, + "kl": 0.017364501953125, + "learning_rate": 9.90952949406554e-07, + "loss": 0.1287, + "num_tokens": 473747640.0, + "reward": 1.3431919813156128, + "reward_std": 0.4228634536266327, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8454241156578064, + "rewards/tag_count_reward/std": 0.2762327194213867, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1029.352783203125, + "completions/mean_terminated_length": 790.8264770507812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15811624314101538, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12925176864888338, + "kl": 0.018096923828125, + "learning_rate": 9.908860376661865e-07, + "loss": 0.097, + "num_tokens": 474275606.0, + "reward": 1.3671876192092896, + "reward_std": 0.40404853224754333, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.263893187046051, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1081.4888916015625, + "completions/mean_terminated_length": 855.1708374023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.15832933780832134, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12945618341875803, + "kl": 0.0189208984375, + "learning_rate": 9.908188819291442e-07, + "loss": 0.1244, + "num_tokens": 474833105.0, + "reward": 1.4520089626312256, + "reward_std": 0.42613303661346436, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8694196343421936, + "rewards/tag_count_reward/std": 0.2562901973724365, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1025.040283203125, + "completions/mean_terminated_length": 822.6363525390625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1585424324756273, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14299017438568432, + "kl": 0.018035888671875, + "learning_rate": 9.907514822325928e-07, + "loss": 0.0939, + "num_tokens": 475362227.0, + "reward": 1.1930804252624512, + "reward_std": 0.40716296434402466, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8470982313156128, + "rewards/tag_count_reward/std": 0.27741706371307373, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1211.399658203125, + "completions/mean_terminated_length": 919.0933227539062, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.15875552714293326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13378513087902502, + "kl": 0.0160980224609375, + "learning_rate": 9.906838386138324e-07, + "loss": 0.1171, + "num_tokens": 475971478.0, + "reward": 1.3013393878936768, + "reward_std": 0.3755875527858734, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.29353824257850647, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1019.8638916015625, + "completions/mean_terminated_length": 851.6233520507812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.15896862181023919, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12413652172631166, + "kl": 0.0186767578125, + "learning_rate": 9.90615951110299e-07, + "loss": 0.0866, + "num_tokens": 476500681.0, + "reward": 1.4765626192092896, + "reward_std": 0.4106946885585785, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8604910969734192, + "rewards/tag_count_reward/std": 0.2602730095386505, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 993.6607666015625, + "completions/mean_terminated_length": 774.8355712890625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.15918171647754514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14079078597187705, + "kl": 0.018707275390625, + "learning_rate": 9.905478197595628e-07, + "loss": 0.0752, + "num_tokens": 477019553.0, + "reward": 1.3783482313156128, + "reward_std": 0.35385826230049133, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.2660040855407715, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1115.1138916015625, + "completions/mean_terminated_length": 912.3125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.1593948111448511, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12223112834888018, + "kl": 0.0166473388671875, + "learning_rate": 9.904794445993294e-07, + "loss": 0.0789, + "num_tokens": 477595748.0, + "reward": 1.2868304252624512, + "reward_std": 0.3850972354412079, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8270089030265808, + "rewards/tag_count_reward/std": 0.2684706449508667, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1117.9107666015625, + "completions/mean_terminated_length": 896.9503173828125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.15960790581215706, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1234044162058948, + "kl": 0.0186767578125, + "learning_rate": 9.904108256674394e-07, + "loss": 0.1167, + "num_tokens": 478161084.0, + "reward": 1.4559152126312256, + "reward_std": 0.4311186373233795, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8621651530265808, + "rewards/tag_count_reward/std": 0.2749870717525482, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 983.5223388671875, + "completions/mean_terminated_length": 837.62939453125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.159821000479463, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.144074280425053, + "kl": 0.023834228515625, + "learning_rate": 9.903419630018676e-07, + "loss": 0.0948, + "num_tokens": 478665526.0, + "reward": 1.4860491752624512, + "reward_std": 0.35792022943496704, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8856026530265808, + "rewards/tag_count_reward/std": 0.24152126908302307, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1047.388427734375, + "completions/mean_terminated_length": 836.4486694335938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16003409514676895, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13387868305106246, + "kl": 0.019073486328125, + "learning_rate": 9.902728566407248e-07, + "loss": 0.0782, + "num_tokens": 479206916.0, + "reward": 1.4213169813156128, + "reward_std": 0.4172287583351135, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8610491156578064, + "rewards/tag_count_reward/std": 0.26137566566467285, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1201.6295166015625, + "completions/mean_terminated_length": 916.1373291015625, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.1602471898140749, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12034827622371361, + "kl": 0.0143890380859375, + "learning_rate": 9.90203506622256e-07, + "loss": 0.1101, + "num_tokens": 479820734.0, + "reward": 1.165178656578064, + "reward_std": 0.41876834630966187, + "rewards/accuracy_reward/mean": 0.3415178656578064, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8236607313156128, + "rewards/tag_count_reward/std": 0.2989324629306793, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1060.671875, + "completions/mean_terminated_length": 822.728515625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.16046028448138086, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13125703208306563, + "kl": 0.01751708984375, + "learning_rate": 9.90133912984841e-07, + "loss": 0.0581, + "num_tokens": 480364011.0, + "reward": 1.3521206378936768, + "reward_std": 0.3559810519218445, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.2688673436641693, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1127.654052734375, + "completions/mean_terminated_length": 869.9571533203125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.16067337914868682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14100222889528688, + "kl": 0.017333984375, + "learning_rate": 9.900640757669943e-07, + "loss": 0.0981, + "num_tokens": 480938480.0, + "reward": 1.321428656578064, + "reward_std": 0.3799952268600464, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.26918813586235046, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1179.7545166015625, + "completions/mean_terminated_length": 982.3178100585938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.16088647381599275, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12578606899914582, + "kl": 0.017486572265625, + "learning_rate": 9.899939950073658e-07, + "loss": 0.1053, + "num_tokens": 481537154.0, + "reward": 1.4157366752624512, + "reward_std": 0.4477072060108185, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8643973469734192, + "rewards/tag_count_reward/std": 0.27508240938186646, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1008.2545166015625, + "completions/mean_terminated_length": 785.6531372070312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.1610995684832987, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13284076978290218, + "kl": 0.017913818359375, + "learning_rate": 9.899236707447399e-07, + "loss": 0.0999, + "num_tokens": 482053652.0, + "reward": 1.3543527126312256, + "reward_std": 0.34607964754104614, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.28354763984680176, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1031.904052734375, + "completions/mean_terminated_length": 807.64306640625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.16131266315060466, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14028527464326643, + "kl": 0.02081298828125, + "learning_rate": 9.898531030180353e-07, + "loss": 0.0651, + "num_tokens": 482588841.0, + "reward": 1.3867188692092896, + "reward_std": 0.4294005334377289, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8510044813156128, + "rewards/tag_count_reward/std": 0.28425124287605286, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1086.419677734375, + "completions/mean_terminated_length": 851.36669921875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.16152575781791062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21480122121295042, + "kl": 0.0209197998046875, + "learning_rate": 9.897822918663062e-07, + "loss": 0.1047, + "num_tokens": 483150101.0, + "reward": 1.2918527126312256, + "reward_std": 0.3935893774032593, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8342633843421936, + "rewards/tag_count_reward/std": 0.2882158160209656, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 982.7723388671875, + "completions/mean_terminated_length": 768.58447265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16173885248521655, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14896821734820667, + "kl": 0.019195556640625, + "learning_rate": 9.89711237328741e-07, + "loss": 0.1176, + "num_tokens": 483673471.0, + "reward": 1.2806919813156128, + "reward_std": 0.38954755663871765, + "rewards/accuracy_reward/mean": 0.48317307233810425, + "rewards/accuracy_reward/std": 0.5003184676170349, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.2969752550125122, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 976.07373046875, + "completions/mean_terminated_length": 780.9208984375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1619519471525225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11987655576536607, + "kl": 0.022247314453125, + "learning_rate": 9.896399394446628e-07, + "loss": 0.0776, + "num_tokens": 484182176.0, + "reward": 1.4001116752624512, + "reward_std": 0.3662990927696228, + "rewards/accuracy_reward/mean": 0.5324074029922485, + "rewards/accuracy_reward/std": 0.49952712655067444, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.23972615599632263, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 991.4910888671875, + "completions/mean_terminated_length": 782.44921875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.16216504181982846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1376876735088269, + "kl": 0.018768310546875, + "learning_rate": 9.895683982535298e-07, + "loss": 0.0981, + "num_tokens": 484696636.0, + "reward": 1.4146206378936768, + "reward_std": 0.36274465918540955, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.27093952894210815, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 962.5692138671875, + "completions/mean_terminated_length": 810.6641235351562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16237813648713442, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14300657442732245, + "kl": 0.020263671875, + "learning_rate": 9.894966137949346e-07, + "loss": 0.1081, + "num_tokens": 485200155.0, + "reward": 1.4637277126312256, + "reward_std": 0.3754221200942993, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8900669813156128, + "rewards/tag_count_reward/std": 0.2492642104625702, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1195.638427734375, + "completions/mean_terminated_length": 854.6937866210938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16259123115444035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11529307523180332, + "kl": 0.01556396484375, + "learning_rate": 9.89424586108604e-07, + "loss": 0.1118, + "num_tokens": 485803289.0, + "reward": 1.2767857313156128, + "reward_std": 0.4165794551372528, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.31335172057151794, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1028.165283203125, + "completions/mean_terminated_length": 757.361572265625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.1628043258217463, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13693380628260118, + "kl": 0.0191650390625, + "learning_rate": 9.893523152344004e-07, + "loss": 0.1402, + "num_tokens": 486331811.0, + "reward": 1.2349331378936768, + "reward_std": 0.4471706449985504, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3088892102241516, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1021.6339721679688, + "completions/mean_terminated_length": 828.3395385742188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16301742048905227, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13890666175283908, + "kl": 0.01776123046875, + "learning_rate": 9.892798012123195e-07, + "loss": 0.0771, + "num_tokens": 486859775.0, + "reward": 1.3186384439468384, + "reward_std": 0.3490552306175232, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8900669813156128, + "rewards/tag_count_reward/std": 0.2366020530462265, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 995.9598388671875, + "completions/mean_terminated_length": 817.4151611328125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.16323051515635822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13509418464534523, + "kl": 0.018524169921875, + "learning_rate": 9.892070440824929e-07, + "loss": 0.0566, + "num_tokens": 487382957.0, + "reward": 1.403459906578064, + "reward_std": 0.4495825171470642, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8677455186843872, + "rewards/tag_count_reward/std": 0.2567932605743408, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1042.5379638671875, + "completions/mean_terminated_length": 803.6713256835938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16344360982366415, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1319257598959288, + "kl": 0.01702880859375, + "learning_rate": 9.891340438851858e-07, + "loss": 0.148, + "num_tokens": 487922574.0, + "reward": 1.2890625, + "reward_std": 0.43297436833381653, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8426339030265808, + "rewards/tag_count_reward/std": 0.2883094549179077, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 960.2991333007812, + "completions/mean_terminated_length": 801.7340087890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1636567044909701, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1376753544507727, + "kl": 0.019317626953125, + "learning_rate": 9.89060800660798e-07, + "loss": 0.0594, + "num_tokens": 488423508.0, + "reward": 1.4090402126312256, + "reward_std": 0.322277694940567, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8934151530265808, + "rewards/tag_count_reward/std": 0.2381325215101242, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 921.419677734375, + "completions/mean_terminated_length": 719.8211059570312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.16386979915827607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1381497842949636, + "kl": 0.020263671875, + "learning_rate": 9.889873144498646e-07, + "loss": 0.122, + "num_tokens": 488903904.0, + "reward": 1.4748884439468384, + "reward_std": 0.366472065448761, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8677455186843872, + "rewards/tag_count_reward/std": 0.26111283898353577, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1036.602783203125, + "completions/mean_terminated_length": 760.76708984375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.16408289382558203, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13640681850162104, + "kl": 0.017303466796875, + "learning_rate": 9.889135852930541e-07, + "loss": 0.1298, + "num_tokens": 489432094.0, + "reward": 1.422991156578064, + "reward_std": 0.40439870953559875, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8805803656578064, + "rewards/tag_count_reward/std": 0.2654026746749878, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 941.8504638671875, + "completions/mean_terminated_length": 740.467041015625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.16429598849288796, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1743098149080204, + "kl": 0.01934814453125, + "learning_rate": 9.888396132311703e-07, + "loss": 0.1406, + "num_tokens": 489919787.0, + "reward": 1.3889509439468384, + "reward_std": 0.404680460691452, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8621651530265808, + "rewards/tag_count_reward/std": 0.26829564571380615, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 956.029052734375, + "completions/mean_terminated_length": 790.4087524414062, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.16450908316019391, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14199491995229113, + "kl": 0.0177001953125, + "learning_rate": 9.887653983051506e-07, + "loss": 0.1274, + "num_tokens": 490418024.0, + "reward": 1.274553656578064, + "reward_std": 0.3672487139701843, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8482142686843872, + "rewards/tag_count_reward/std": 0.27089864015579224, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 992.7857666015625, + "completions/mean_terminated_length": 790.723388671875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16472217782749987, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14502102409347226, + "kl": 0.018035888671875, + "learning_rate": 9.886909405560675e-07, + "loss": 0.1574, + "num_tokens": 490931112.0, + "reward": 1.3828126192092896, + "reward_std": 0.41127461194992065, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8470982313156128, + "rewards/tag_count_reward/std": 0.28536733984947205, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 1010.4397583007812, + "completions/mean_terminated_length": 831.1754150390625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.16493527249480583, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12931928876788112, + "kl": 0.01715087890625, + "learning_rate": 9.88616240025128e-07, + "loss": 0.0495, + "num_tokens": 491452253.0, + "reward": 1.4598214626312256, + "reward_std": 0.34742823243141174, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.49405214190483093, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8794642686843872, + "rewards/tag_count_reward/std": 0.2524646520614624, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1137.77685546875, + "completions/mean_terminated_length": 859.1370849609375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.16514836716211176, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1.3383580850687435, + "kl": 0.0629730224609375, + "learning_rate": 9.885412967536728e-07, + "loss": 0.1246, + "num_tokens": 492050601.0, + "reward": 1.32421875, + "reward_std": 0.4323054254055023, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8376116156578064, + "rewards/tag_count_reward/std": 0.2910819947719574, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1064.915283203125, + "completions/mean_terminated_length": 888.9947509765625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.16536146182941772, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12153622803148827, + "kl": 0.0170745849609375, + "learning_rate": 9.884661107831773e-07, + "loss": 0.0593, + "num_tokens": 492599155.0, + "reward": 1.368303656578064, + "reward_std": 0.3781953752040863, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8727678656578064, + "rewards/tag_count_reward/std": 0.2590542435646057, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1010.1473388671875, + "completions/mean_terminated_length": 798.1129150390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16557455649672367, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.18556961100941505, + "kl": 0.019195556640625, + "learning_rate": 9.883906821552514e-07, + "loss": 0.0872, + "num_tokens": 493115829.0, + "reward": 1.403459906578064, + "reward_std": 0.4156063497066498, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8900669813156128, + "rewards/tag_count_reward/std": 0.25808316469192505, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1048.727783203125, + "completions/mean_terminated_length": 903.0537109375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.16578765116402963, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12083756949635982, + "kl": 0.016815185546875, + "learning_rate": 9.883150109116386e-07, + "loss": 0.1142, + "num_tokens": 493653483.0, + "reward": 1.383928656578064, + "reward_std": 0.3617124855518341, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9017857313156128, + "rewards/tag_count_reward/std": 0.23851032555103302, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1033.154052734375, + "completions/mean_terminated_length": 815.8834838867188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.16600074583133556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13837038557885342, + "kl": 0.01910400390625, + "learning_rate": 9.882390970942176e-07, + "loss": 0.1002, + "num_tokens": 494182064.0, + "reward": 1.4257813692092896, + "reward_std": 0.43116581439971924, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.8722098469734192, + "rewards/tag_count_reward/std": 0.2659730017185211, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1079.94873046875, + "completions/mean_terminated_length": 875.8729858398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16621384049864152, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13287125394033297, + "kl": 0.01800537109375, + "learning_rate": 9.881629407450006e-07, + "loss": 0.073, + "num_tokens": 494744057.0, + "reward": 1.4001116752624512, + "reward_std": 0.32378655672073364, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8800223469734192, + "rewards/tag_count_reward/std": 0.25189971923828125, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1082.265625, + "completions/mean_terminated_length": 865.8988647460938, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.16642693516594748, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12037159581499403, + "kl": 0.0162506103515625, + "learning_rate": 9.880865419061344e-07, + "loss": 0.1045, + "num_tokens": 495304544.0, + "reward": 1.2840402126312256, + "reward_std": 0.4162614941596985, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8643973469734192, + "rewards/tag_count_reward/std": 0.2771081030368805, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1050.140625, + "completions/mean_terminated_length": 836.5067749023438, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.16664002983325343, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2314128222723499, + "kl": 0.01947021484375, + "learning_rate": 9.880099006198998e-07, + "loss": 0.0892, + "num_tokens": 495848303.0, + "reward": 1.4006696939468384, + "reward_std": 0.3899260461330414, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9051339030265808, + "rewards/tag_count_reward/std": 0.2321632206439972, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1118.55810546875, + "completions/mean_terminated_length": 778.5182495117188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.16685312450055936, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11867931289512722, + "kl": 0.0167999267578125, + "learning_rate": 9.879330169287121e-07, + "loss": 0.08, + "num_tokens": 496419369.0, + "reward": 1.3141741752624512, + "reward_std": 0.3667290508747101, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8699776530265808, + "rewards/tag_count_reward/std": 0.2659401297569275, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1125.1429443359375, + "completions/mean_terminated_length": 853.086669921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.16706621916786532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12625275680782078, + "kl": 0.01666259765625, + "learning_rate": 9.878558908751205e-07, + "loss": 0.1155, + "num_tokens": 497000329.0, + "reward": 1.3543527126312256, + "reward_std": 0.4074389636516571, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8364955186843872, + "rewards/tag_count_reward/std": 0.30225032567977905, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1078.5223388671875, + "completions/mean_terminated_length": 861.31689453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.16727931383517128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29144328051957286, + "kl": 0.0318603515625, + "learning_rate": 9.877785225018085e-07, + "loss": 0.1253, + "num_tokens": 497561491.0, + "reward": 1.2767857313156128, + "reward_std": 0.44960084557533264, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8683035969734192, + "rewards/tag_count_reward/std": 0.27265334129333496, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1149.19873046875, + "completions/mean_terminated_length": 820.368896484375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.16749240850247724, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11643618902357854, + "kl": 0.017364501953125, + "learning_rate": 9.87700911851593e-07, + "loss": 0.0868, + "num_tokens": 498145836.0, + "reward": 1.3599331378936768, + "reward_std": 0.42775994539260864, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8175223469734192, + "rewards/tag_count_reward/std": 0.31514689326286316, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1270.4754638671875, + "completions/mean_terminated_length": 949.1640625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16770550316978317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1158507526117939, + "kl": 0.013671875, + "learning_rate": 9.87623058967426e-07, + "loss": 0.0857, + "num_tokens": 498782209.0, + "reward": 1.1891741752624512, + "reward_std": 0.4404595196247101, + "rewards/accuracy_reward/mean": 0.3504464328289032, + "rewards/accuracy_reward/std": 0.47764310240745544, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8387276530265808, + "rewards/tag_count_reward/std": 0.29598572850227356, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1185.7054443359375, + "completions/mean_terminated_length": 891.3892822265625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.16791859783708912, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11619686081921426, + "kl": 0.0176239013671875, + "learning_rate": 9.875449638923935e-07, + "loss": 0.0715, + "num_tokens": 499383533.0, + "reward": 1.2818081378936768, + "reward_std": 0.372024804353714, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8911830186843872, + "rewards/tag_count_reward/std": 0.25692933797836304, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1139.7366943359375, + "completions/mean_terminated_length": 875.3717041015625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.16813169250439508, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13274790507710013, + "kl": 0.015838623046875, + "learning_rate": 9.874666266697141e-07, + "loss": 0.0824, + "num_tokens": 499968391.0, + "reward": 1.407366156578064, + "reward_std": 0.3396851718425751, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8962053656578064, + "rewards/tag_count_reward/std": 0.24881619215011597, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1086.1317138671875, + "completions/mean_terminated_length": 895.8155517578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16834478717170104, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13580965646497772, + "kl": 0.0187225341796875, + "learning_rate": 9.873880473427424e-07, + "loss": 0.1234, + "num_tokens": 500525442.0, + "reward": 1.3465402126312256, + "reward_std": 0.3806202709674835, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8934151530265808, + "rewards/tag_count_reward/std": 0.2398875206708908, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1015.825927734375, + "completions/mean_terminated_length": 804.9515991210938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16855788183900697, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13901525481969168, + "kl": 0.01849365234375, + "learning_rate": 9.873092259549657e-07, + "loss": 0.1408, + "num_tokens": 501047876.0, + "reward": 1.3850446939468384, + "reward_std": 0.4454170763492584, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8694196343421936, + "rewards/tag_count_reward/std": 0.274212509393692, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1069.529052734375, + "completions/mean_terminated_length": 860.0460815429688, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.16877097650631293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11783784715946118, + "kl": 0.01739501953125, + "learning_rate": 9.872301625500054e-07, + "loss": 0.0678, + "num_tokens": 501595665.0, + "reward": 1.4441964626312256, + "reward_std": 0.3454887568950653, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8928571343421936, + "rewards/tag_count_reward/std": 0.24682386219501495, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1114.102783203125, + "completions/mean_terminated_length": 810.171630859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16898407117361888, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11269875261426066, + "kl": 0.0170440673828125, + "learning_rate": 9.871508571716173e-07, + "loss": 0.0436, + "num_tokens": 502160975.0, + "reward": 1.3030134439468384, + "reward_std": 0.39404064416885376, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9190848469734192, + "rewards/tag_count_reward/std": 0.21482694149017334, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1000.1897583007812, + "completions/mean_terminated_length": 758.3873901367188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.16919716584092484, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1316779247178525, + "kl": 0.020599365234375, + "learning_rate": 9.870713098636912e-07, + "loss": 0.0759, + "num_tokens": 502678196.0, + "reward": 1.4335938692092896, + "reward_std": 0.36590439081192017, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.2757668197154999, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1012.4085083007812, + "completions/mean_terminated_length": 849.1757202148438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.16941026050823077, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1280866594894134, + "kl": 0.01824951171875, + "learning_rate": 9.869915206702495e-07, + "loss": 0.0971, + "num_tokens": 503198859.0, + "reward": 1.4944196939468384, + "reward_std": 0.41933438181877136, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8939732313156128, + "rewards/tag_count_reward/std": 0.24274127185344696, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 938.8928833007812, + "completions/mean_terminated_length": 780.448974609375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.16962335517553673, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13960112741214545, + "kl": 0.019683837890625, + "learning_rate": 9.869114896354501e-07, + "loss": 0.1859, + "num_tokens": 503683099.0, + "reward": 1.4419643878936768, + "reward_std": 0.37365370988845825, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316954612732, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8861607313156128, + "rewards/tag_count_reward/std": 0.23264934122562408, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 954.810302734375, + "completions/mean_terminated_length": 795.4450073242188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.16983644984284268, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1416180253116865, + "kl": 0.0211181640625, + "learning_rate": 9.868312168035841e-07, + "loss": 0.0721, + "num_tokens": 504180086.0, + "reward": 1.5178571939468384, + "reward_std": 0.3867291510105133, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9040178656578064, + "rewards/tag_count_reward/std": 0.22558781504631042, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1058.665283203125, + "completions/mean_terminated_length": 820.2382202148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17004954451014864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12907451550579208, + "kl": 0.01727294921875, + "learning_rate": 9.86750702219076e-07, + "loss": 0.1217, + "num_tokens": 504728944.0, + "reward": 1.31640625, + "reward_std": 0.3998180329799652, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8699776530265808, + "rewards/tag_count_reward/std": 0.2685560882091522, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 982.15185546875, + "completions/mean_terminated_length": 814.14990234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17026263917745457, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14113066617204376, + "kl": 0.01953125, + "learning_rate": 9.866699459264846e-07, + "loss": 0.0963, + "num_tokens": 505241796.0, + "reward": 1.5066964626312256, + "reward_std": 0.36500248312950134, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8995535969734192, + "rewards/tag_count_reward/std": 0.23521094024181366, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1133.5357666015625, + "completions/mean_terminated_length": 910.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17047573384476053, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11182911996434776, + "kl": 0.0133514404296875, + "learning_rate": 9.865889479705027e-07, + "loss": 0.1273, + "num_tokens": 505825028.0, + "reward": 1.3649554252624512, + "reward_std": 0.44971877336502075, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.24096561968326569, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1078.962158203125, + "completions/mean_terminated_length": 868.3016357421875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1706888285120665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13245310528949117, + "kl": 0.0164947509765625, + "learning_rate": 9.865077083959557e-07, + "loss": 0.098, + "num_tokens": 506379731.0, + "reward": 1.321428656578064, + "reward_std": 0.40782609581947327, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8772321343421936, + "rewards/tag_count_reward/std": 0.2696329653263092, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1006.85498046875, + "completions/mean_terminated_length": 807.4866943359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17090192317937244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13758957582685366, + "kl": 0.017608642578125, + "learning_rate": 9.864262272478043e-07, + "loss": 0.058, + "num_tokens": 506896114.0, + "reward": 1.4614956378936768, + "reward_std": 0.3501630127429962, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9079241156578064, + "rewards/tag_count_reward/std": 0.22752127051353455, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1085.680908203125, + "completions/mean_terminated_length": 866.85205078125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.17111501784667837, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15229395075382232, + "kl": 0.017822265625, + "learning_rate": 9.863445045711415e-07, + "loss": 0.1377, + "num_tokens": 507452883.0, + "reward": 1.3119419813156128, + "reward_std": 0.367929607629776, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.26011165976524353, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1033.9554443359375, + "completions/mean_terminated_length": 826.7849731445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17132811251398433, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12760782686145986, + "kl": 0.018646240234375, + "learning_rate": 9.862625404111947e-07, + "loss": 0.1117, + "num_tokens": 507987983.0, + "reward": 1.3950893878936768, + "reward_std": 0.3463027775287628, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.8995535969734192, + "rewards/tag_count_reward/std": 0.24281583726406097, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1096.078125, + "completions/mean_terminated_length": 829.5399780273438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1715412071812903, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12176998301862253, + "kl": 0.017364501953125, + "learning_rate": 9.861803348133248e-07, + "loss": 0.091, + "num_tokens": 508549026.0, + "reward": 1.3844866752624512, + "reward_std": 0.3844606578350067, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8643973469734192, + "rewards/tag_count_reward/std": 0.2735532820224762, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1027.65185546875, + "completions/mean_terminated_length": 785.2486572265625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.17175430184859625, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15780490669581076, + "kl": 0.019256591796875, + "learning_rate": 9.860978878230266e-07, + "loss": 0.0596, + "num_tokens": 509084406.0, + "reward": 1.2957589626312256, + "reward_std": 0.3426037132740021, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.24278241395950317, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1014.5335083007812, + "completions/mean_terminated_length": 842.2890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17196739651590218, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13828243637365842, + "kl": 0.01934814453125, + "learning_rate": 9.860151994859277e-07, + "loss": 0.0881, + "num_tokens": 509607797.0, + "reward": 1.426897406578064, + "reward_std": 0.4110271632671356, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8889508843421936, + "rewards/tag_count_reward/std": 0.24707597494125366, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 961.77685546875, + "completions/mean_terminated_length": 780.7396240234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17218049118320813, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14546094836819282, + "kl": 0.0201416015625, + "learning_rate": 9.8593226984779e-07, + "loss": 0.0453, + "num_tokens": 510103521.0, + "reward": 1.383928656578064, + "reward_std": 0.25870847702026367, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9084821343421936, + "rewards/tag_count_reward/std": 0.22309480607509613, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 910.4241333007812, + "completions/mean_terminated_length": 767.5125732421875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.1723935858505141, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13986036976829633, + "kl": 0.02178955078125, + "learning_rate": 9.85849098954509e-07, + "loss": 0.0504, + "num_tokens": 510573743.0, + "reward": 1.5876116752624512, + "reward_std": 0.3486694395542145, + "rewards/accuracy_reward/mean": 0.6830357313156128, + "rewards/accuracy_reward/std": 0.4658135175704956, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9045758843421936, + "rewards/tag_count_reward/std": 0.2166612446308136, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1004.6295166015625, + "completions/mean_terminated_length": 827.55615234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17260668051782005, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14373156835951706, + "kl": 0.02008056640625, + "learning_rate": 9.857656868521128e-07, + "loss": 0.0967, + "num_tokens": 511092233.0, + "reward": 1.3258929252624512, + "reward_std": 0.29462239146232605, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9084821343421936, + "rewards/tag_count_reward/std": 0.21673686802387238, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 983.19873046875, + "completions/mean_terminated_length": 808.9584350585938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.17281977518512598, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14363248711857868, + "kl": 0.019317626953125, + "learning_rate": 9.85682033586764e-07, + "loss": 0.1254, + "num_tokens": 511609842.0, + "reward": 1.4547991752624512, + "reward_std": 0.3784177005290985, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8922991156578064, + "rewards/tag_count_reward/std": 0.24055272340774536, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1027.37060546875, + "completions/mean_terminated_length": 763.6123657226562, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.17303286985243194, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15304008254762147, + "kl": 0.01873779296875, + "learning_rate": 9.855981392047582e-07, + "loss": 0.0761, + "num_tokens": 512141128.0, + "reward": 1.3560268878936768, + "reward_std": 0.34131333231925964, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.27262356877326965, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 1077.279052734375, + "completions/mean_terminated_length": 885.2112426757812, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1732459645197379, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1255546197262071, + "kl": 0.018829345703125, + "learning_rate": 9.855140037525246e-07, + "loss": 0.105, + "num_tokens": 512706277.0, + "reward": 1.4810268878936768, + "reward_std": 0.38749974966049194, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.4921026825904846, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8895089030265808, + "rewards/tag_count_reward/std": 0.2564849555492401, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 938.69873046875, + "completions/mean_terminated_length": 726.2792358398438, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.17345905918704385, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13838736473928231, + "kl": 0.01861572265625, + "learning_rate": 9.854296272766258e-07, + "loss": 0.0565, + "num_tokens": 513191582.0, + "reward": 1.4988839626312256, + "reward_std": 0.3418709635734558, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.22692033648490906, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1000.1250610351562, + "completions/mean_terminated_length": 786.0430297851562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17367215385434978, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12841674594571809, + "kl": 0.018890380859375, + "learning_rate": 9.853450098237576e-07, + "loss": 0.0961, + "num_tokens": 513711494.0, + "reward": 1.3811384439468384, + "reward_std": 0.3607409596443176, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9168526530265808, + "rewards/tag_count_reward/std": 0.2191363126039505, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1043.165283203125, + "completions/mean_terminated_length": 824.7228393554688, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.17388524852165574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12796800266321556, + "kl": 0.018310546875, + "learning_rate": 9.852601514407492e-07, + "loss": 0.0883, + "num_tokens": 514241472.0, + "reward": 1.4910714626312256, + "reward_std": 0.4593278467655182, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8861607313156128, + "rewards/tag_count_reward/std": 0.2511458992958069, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 817.3013916015625, + "completions/mean_terminated_length": 709.7645874023438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1740983431889617, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22187650620338542, + "kl": 0.0279541015625, + "learning_rate": 9.851750521745631e-07, + "loss": 0.0593, + "num_tokens": 514675687.0, + "reward": 1.5446429252624512, + "reward_std": 0.2849263548851013, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.17457373440265656, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 935.9710083007812, + "completions/mean_terminated_length": 760.68994140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.17431143785626765, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1455488867155865, + "kl": 0.0235595703125, + "learning_rate": 9.850897120722958e-07, + "loss": 0.0711, + "num_tokens": 515163258.0, + "reward": 1.5641741752624512, + "reward_std": 0.35682252049446106, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764313220977783, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.21243098378181458, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 983.1428833007812, + "completions/mean_terminated_length": 795.884521484375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1745245325235736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13117668369356553, + "kl": 0.018157958984375, + "learning_rate": 9.85004131181176e-07, + "loss": 0.102, + "num_tokens": 515667386.0, + "reward": 1.4726563692092896, + "reward_std": 0.40249624848365784, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8989955186843872, + "rewards/tag_count_reward/std": 0.24229536950588226, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1097.5335693359375, + "completions/mean_terminated_length": 795.62060546875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.17473762719087954, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.9049417323035682, + "kl": 0.0238037109375, + "learning_rate": 9.849183095485662e-07, + "loss": 0.1036, + "num_tokens": 516232585.0, + "reward": 1.3510044813156128, + "reward_std": 0.4079856276512146, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8934151530265808, + "rewards/tag_count_reward/std": 0.24393343925476074, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1068.0067138671875, + "completions/mean_terminated_length": 870.9571533203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.1749507218581855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12649692216588773, + "kl": 0.017791748046875, + "learning_rate": 9.848322472219625e-07, + "loss": 0.0822, + "num_tokens": 516787052.0, + "reward": 1.3805804252624512, + "reward_std": 0.36983272433280945, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.25340983271598816, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 934.1607666015625, + "completions/mean_terminated_length": 824.9608154296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17516381652549146, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13396193991790273, + "kl": 0.01947021484375, + "learning_rate": 9.847459442489933e-07, + "loss": 0.0602, + "num_tokens": 517267780.0, + "reward": 1.6216518878936768, + "reward_std": 0.2967464029788971, + "rewards/accuracy_reward/mean": 0.7165178656578064, + "rewards/accuracy_reward/std": 0.4511922299861908, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9051339030265808, + "rewards/tag_count_reward/std": 0.2235727310180664, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 981.5491333007812, + "completions/mean_terminated_length": 838.4556884765625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.1753769111927974, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13399583815501598, + "kl": 0.019439697265625, + "learning_rate": 9.846594006774207e-07, + "loss": 0.1079, + "num_tokens": 517771258.0, + "reward": 1.4704241752624512, + "reward_std": 0.3607228398323059, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8967633843421936, + "rewards/tag_count_reward/std": 0.24018916487693787, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1054.446533203125, + "completions/mean_terminated_length": 841.7344360351562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17559000586010334, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.129341305739734, + "kl": 0.017364501953125, + "learning_rate": 9.845726165551406e-07, + "loss": 0.119, + "num_tokens": 518315250.0, + "reward": 1.3973214626312256, + "reward_std": 0.4071505665779114, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.26121383905410767, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1204.93310546875, + "completions/mean_terminated_length": 975.0057373046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1758031005274093, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12304783085405822, + "kl": 0.016998291015625, + "learning_rate": 9.844855919301809e-07, + "loss": 0.1033, + "num_tokens": 518922836.0, + "reward": 1.2678571939468384, + "reward_std": 0.403646320104599, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8816964030265808, + "rewards/tag_count_reward/std": 0.26747602224349976, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 996.0870971679688, + "completions/mean_terminated_length": 763.9209594726562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17601619519471526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15660129548662077, + "kl": 0.01922607421875, + "learning_rate": 9.843983268507028e-07, + "loss": 0.0898, + "num_tokens": 519440171.0, + "reward": 1.3560268878936768, + "reward_std": 0.35104814171791077, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8917410969734192, + "rewards/tag_count_reward/std": 0.24290579557418823, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 934.3616333007812, + "completions/mean_terminated_length": 794.457275390625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.17622928986202122, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1406009542567803, + "kl": 0.01898193359375, + "learning_rate": 9.843108213650013e-07, + "loss": 0.0998, + "num_tokens": 519924813.0, + "reward": 1.3515626192092896, + "reward_std": 0.31163233518600464, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.2222510427236557, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1191.16748046875, + "completions/mean_terminated_length": 944.951171875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.17644238452932715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11624911417911378, + "kl": 0.0152435302734375, + "learning_rate": 9.84223075521504e-07, + "loss": 0.0562, + "num_tokens": 520531000.0, + "reward": 1.28125, + "reward_std": 0.3583948314189911, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8973214030265808, + "rewards/tag_count_reward/std": 0.24815554916858673, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1110.477783203125, + "completions/mean_terminated_length": 834.0982666015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1766554791966331, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13957798308123592, + "kl": 0.01776123046875, + "learning_rate": 9.841350893687712e-07, + "loss": 0.0982, + "num_tokens": 521097742.0, + "reward": 1.2840402126312256, + "reward_std": 0.4644352197647095, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8532366156578064, + "rewards/tag_count_reward/std": 0.2878517210483551, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1108.9910888671875, + "completions/mean_terminated_length": 828.6492919921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17686857386393906, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.3062296208053762, + "kl": 0.03692626953125, + "learning_rate": 9.840468629554968e-07, + "loss": 0.0827, + "num_tokens": 521661482.0, + "reward": 1.4006696939468384, + "reward_std": 0.3762128949165344, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.26315414905548096, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1021.2656860351562, + "completions/mean_terminated_length": 868.5718383789062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17708166853124502, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12098574812589716, + "kl": 0.0179443359375, + "learning_rate": 9.839583963305068e-07, + "loss": 0.1055, + "num_tokens": 522187905.0, + "reward": 1.5496652126312256, + "reward_std": 0.3815498650074005, + "rewards/accuracy_reward/mean": 0.6696428656578064, + "rewards/accuracy_reward/std": 0.47086748480796814, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8800223469734192, + "rewards/tag_count_reward/std": 0.26009246706962585, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1015.7500610351562, + "completions/mean_terminated_length": 811.508056640625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.17729476319855095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12536385488903412, + "kl": 0.0174560546875, + "learning_rate": 9.838696895427614e-07, + "loss": 0.0773, + "num_tokens": 522710913.0, + "reward": 1.4642857313156128, + "reward_std": 0.3339889347553253, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9017857313156128, + "rewards/tag_count_reward/std": 0.23197223246097565, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 986.8147583007812, + "completions/mean_terminated_length": 796.91845703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1775078578658569, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14978774051107172, + "kl": 0.019805908203125, + "learning_rate": 9.837807426413526e-07, + "loss": 0.1088, + "num_tokens": 523225534.0, + "reward": 1.5290179252624512, + "reward_std": 0.3605847954750061, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9017857313156128, + "rewards/tag_count_reward/std": 0.23015686869621277, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1119.1116943359375, + "completions/mean_terminated_length": 892.050048828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17772095253316286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13096546853962232, + "kl": 0.01715087890625, + "learning_rate": 9.83691555675506e-07, + "loss": 0.1229, + "num_tokens": 523797680.0, + "reward": 1.4017857313156128, + "reward_std": 0.440616637468338, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8973214030265808, + "rewards/tag_count_reward/std": 0.23955488204956055, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1011.8638916015625, + "completions/mean_terminated_length": 842.3142700195312, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.17793404720046882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1429841387124913, + "kl": 0.0174560546875, + "learning_rate": 9.836021286945794e-07, + "loss": 0.1207, + "num_tokens": 524317875.0, + "reward": 1.3470982313156128, + "reward_std": 0.33667925000190735, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.49717557430267334, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9051339030265808, + "rewards/tag_count_reward/std": 0.2309555560350418, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1039.743408203125, + "completions/mean_terminated_length": 849.859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17814714186777475, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1265815492536423, + "kl": 0.019500732421875, + "learning_rate": 9.835124617480643e-07, + "loss": 0.1154, + "num_tokens": 524855072.0, + "reward": 1.3978794813156128, + "reward_std": 0.3377029001712799, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8956473469734192, + "rewards/tag_count_reward/std": 0.2326003611087799, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1016.7879638671875, + "completions/mean_terminated_length": 778.8159790039062, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1783602365350807, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15091653024462656, + "kl": 0.01971435546875, + "learning_rate": 9.834225548855838e-07, + "loss": 0.1042, + "num_tokens": 525373729.0, + "reward": 1.4793527126312256, + "reward_std": 0.3538217842578888, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8856026530265808, + "rewards/tag_count_reward/std": 0.2544882893562317, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1138.060302734375, + "completions/mean_terminated_length": 918.7672729492188, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.17857333120238666, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11816602252606195, + "kl": 0.017120361328125, + "learning_rate": 9.83332408156895e-07, + "loss": 0.0638, + "num_tokens": 525955596.0, + "reward": 1.477678656578064, + "reward_std": 0.39377862215042114, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.23201528191566467, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1125.7054443359375, + "completions/mean_terminated_length": 853.8150024414062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17878642586969262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14082054647841116, + "kl": 0.01812744140625, + "learning_rate": 9.832420216118871e-07, + "loss": 0.1132, + "num_tokens": 526534872.0, + "reward": 1.2444196939468384, + "reward_std": 0.41731616854667664, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.28232377767562866, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1134.384033203125, + "completions/mean_terminated_length": 904.703857421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17899952053699855, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1341890070250784, + "kl": 0.020111083984375, + "learning_rate": 9.83151395300582e-07, + "loss": 0.0891, + "num_tokens": 527115316.0, + "reward": 1.4285714626312256, + "reward_std": 0.31156718730926514, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8973214030265808, + "rewards/tag_count_reward/std": 0.231843039393425, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1100.265625, + "completions/mean_terminated_length": 909.7024536132812, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.1792126152043045, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11540806408311374, + "kl": 0.017669677734375, + "learning_rate": 9.830605292731347e-07, + "loss": 0.0534, + "num_tokens": 527673387.0, + "reward": 1.4469866752624512, + "reward_std": 0.3376753032207489, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9157366156578064, + "rewards/tag_count_reward/std": 0.21483854949474335, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1135.040283203125, + "completions/mean_terminated_length": 908.70751953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17942570987161047, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12256690693632091, + "kl": 0.017242431640625, + "learning_rate": 9.829694235798323e-07, + "loss": 0.0964, + "num_tokens": 528255453.0, + "reward": 1.4592634439468384, + "reward_std": 0.4008312523365021, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.2595735788345337, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1093.1295166015625, + "completions/mean_terminated_length": 815.1988525390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.17963880453891642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39184714419617994, + "kl": 0.02716064453125, + "learning_rate": 9.828780782710948e-07, + "loss": 0.0838, + "num_tokens": 528811655.0, + "reward": 1.3030134439468384, + "reward_std": 0.37335294485092163, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.2517460286617279, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1103.96435546875, + "completions/mean_terminated_length": 839.63427734375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.17985189920622235, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12563362753294519, + "kl": 0.018310546875, + "learning_rate": 9.827864933974753e-07, + "loss": 0.0906, + "num_tokens": 529381671.0, + "reward": 1.28125, + "reward_std": 0.3379424214363098, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8816964030265808, + "rewards/tag_count_reward/std": 0.253520667552948, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1088.3013916015625, + "completions/mean_terminated_length": 794.5160522460938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1800649938735283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14116412359340935, + "kl": 0.01715087890625, + "learning_rate": 9.826946690096583e-07, + "loss": 0.1054, + "num_tokens": 529936574.0, + "reward": 1.454241156578064, + "reward_std": 0.3952125012874603, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.26002347469329834, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1049.4754638671875, + "completions/mean_terminated_length": 838.9757080078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18027808854083427, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1323145174629971, + "kl": 0.018157958984375, + "learning_rate": 9.826026051584622e-07, + "loss": 0.1141, + "num_tokens": 530469683.0, + "reward": 1.4135044813156128, + "reward_std": 0.3943188786506653, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8577008843421936, + "rewards/tag_count_reward/std": 0.2832612991333008, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1028.622802734375, + "completions/mean_terminated_length": 846.2079467773438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.18049118320814023, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1390860132543264, + "kl": 0.0186767578125, + "learning_rate": 9.825103018948368e-07, + "loss": 0.0531, + "num_tokens": 530996042.0, + "reward": 1.4207589626312256, + "reward_std": 0.32364344596862793, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390644073486, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.21276935935020447, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1062.493408203125, + "completions/mean_terminated_length": 892.2225341796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18070427787544616, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12394130330994002, + "kl": 0.01898193359375, + "learning_rate": 9.824177592698654e-07, + "loss": 0.0466, + "num_tokens": 531539255.0, + "reward": 1.4720982313156128, + "reward_std": 0.3249511122703552, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.22599419951438904, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1134.888427734375, + "completions/mean_terminated_length": 812.1268920898438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18091737254275211, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1324424655918439, + "kl": 0.015594482421875, + "learning_rate": 9.823249773347629e-07, + "loss": 0.0989, + "num_tokens": 532122085.0, + "reward": 1.2410714626312256, + "reward_std": 0.34604308009147644, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9017857313156128, + "rewards/tag_count_reward/std": 0.23792338371276855, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 988.3214721679688, + "completions/mean_terminated_length": 827.5989990234375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.18113046721005807, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14527627770587262, + "kl": 0.019622802734375, + "learning_rate": 9.822319561408772e-07, + "loss": 0.0967, + "num_tokens": 532632341.0, + "reward": 1.512834906578064, + "reward_std": 0.3208707869052887, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.18335527181625366, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 928.04248046875, + "completions/mean_terminated_length": 771.3053588867188, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.18134356187736403, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.17837806822699176, + "kl": 0.02435302734375, + "learning_rate": 9.821386957396882e-07, + "loss": 0.0942, + "num_tokens": 533118312.0, + "reward": 1.5580357313156128, + "reward_std": 0.31677085161209106, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.21902872622013092, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1080.390625, + "completions/mean_terminated_length": 876.4081420898438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.18155665654466996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14599819769459818, + "kl": 0.018035888671875, + "learning_rate": 9.820451961828085e-07, + "loss": 0.1153, + "num_tokens": 533675303.0, + "reward": 1.3989956378936768, + "reward_std": 0.37759333848953247, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.2138717770576477, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1081.825927734375, + "completions/mean_terminated_length": 855.5867919921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18176975121197592, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14042897147515412, + "kl": 0.0163726806640625, + "learning_rate": 9.81951457521983e-07, + "loss": 0.0989, + "num_tokens": 534226873.0, + "reward": 1.368303656578064, + "reward_std": 0.3966476023197174, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8794642686843872, + "rewards/tag_count_reward/std": 0.26171037554740906, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1140.2545166015625, + "completions/mean_terminated_length": 905.6685791015625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.18198284587928187, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1287666765472608, + "kl": 0.0171966552734375, + "learning_rate": 9.81857479809089e-07, + "loss": 0.07, + "num_tokens": 534814299.0, + "reward": 1.3844866752624512, + "reward_std": 0.38776344060897827, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9068080186843872, + "rewards/tag_count_reward/std": 0.23612141609191895, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1116.2567138671875, + "completions/mean_terminated_length": 848.5143432617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18219594054658783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13093682328142742, + "kl": 0.016754150390625, + "learning_rate": 9.817632630961354e-07, + "loss": 0.1137, + "num_tokens": 535377678.0, + "reward": 1.3872768878936768, + "reward_std": 0.3309180438518524, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8805803656578064, + "rewards/tag_count_reward/std": 0.2584632337093353, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1108.6317138671875, + "completions/mean_terminated_length": 891.8544311523438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18240903521389376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12030697921826104, + "kl": 0.01666259765625, + "learning_rate": 9.816688074352645e-07, + "loss": 0.1062, + "num_tokens": 535941961.0, + "reward": 1.3247768878936768, + "reward_std": 0.36762282252311707, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.24038465321063995, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1039.4107666015625, + "completions/mean_terminated_length": 846.2765502929688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18262212988119972, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12695347725929787, + "kl": 0.01788330078125, + "learning_rate": 9.815741128787503e-07, + "loss": 0.1243, + "num_tokens": 536472001.0, + "reward": 1.4157366752624512, + "reward_std": 0.3780704736709595, + "rewards/accuracy_reward/mean": 0.5532407164573669, + "rewards/accuracy_reward/std": 0.4977337718009949, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8822544813156128, + "rewards/tag_count_reward/std": 0.25733718276023865, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1013.388427734375, + "completions/mean_terminated_length": 818.5410766601562, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.18283522454850568, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.5118013329070157, + "kl": 0.05303955078125, + "learning_rate": 9.814791794789986e-07, + "loss": 0.0213, + "num_tokens": 536994319.0, + "reward": 1.4045759439468384, + "reward_std": 0.28441864252090454, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.2086479365825653, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1102.946533203125, + "completions/mean_terminated_length": 841.7777709960938, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.18304831921581163, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13643183175576157, + "kl": 0.019287109375, + "learning_rate": 9.81384007288548e-07, + "loss": 0.0525, + "num_tokens": 537554583.0, + "reward": 1.469866156578064, + "reward_std": 0.3209141492843628, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.21987920999526978, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 973.8013916015625, + "completions/mean_terminated_length": 820.3443603515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18326141388311756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1365111540687456, + "kl": 0.018890380859375, + "learning_rate": 9.81288596360069e-07, + "loss": 0.0998, + "num_tokens": 538061934.0, + "reward": 1.505022406578064, + "reward_std": 0.4115324914455414, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.21634988486766815, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1032.868408203125, + "completions/mean_terminated_length": 808.8201293945312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.18347450855042352, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.135029607527849, + "kl": 0.01715087890625, + "learning_rate": 9.811929467463644e-07, + "loss": 0.0942, + "num_tokens": 538594979.0, + "reward": 1.501116156578064, + "reward_std": 0.38599494099617004, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8939732313156128, + "rewards/tag_count_reward/std": 0.25673794746398926, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1087.6875, + "completions/mean_terminated_length": 856.2548217773438, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.18368760321772948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13104428429302495, + "kl": 0.01885986328125, + "learning_rate": 9.810970585003686e-07, + "loss": 0.0744, + "num_tokens": 539146199.0, + "reward": 1.3476563692092896, + "reward_std": 0.3571614623069763, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8900669813156128, + "rewards/tag_count_reward/std": 0.24870266020298004, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1120.85498046875, + "completions/mean_terminated_length": 874.663818359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18390069788503544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12790363551308906, + "kl": 0.01629638671875, + "learning_rate": 9.810009316751487e-07, + "loss": 0.1049, + "num_tokens": 539712694.0, + "reward": 1.3102679252624512, + "reward_std": 0.40214911103248596, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9017857313156128, + "rewards/tag_count_reward/std": 0.24430227279663086, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 1103.1116943359375, + "completions/mean_terminated_length": 888.24658203125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.18411379255234137, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11861042092319073, + "kl": 0.0177001953125, + "learning_rate": 9.809045663239033e-07, + "loss": 0.0826, + "num_tokens": 540280600.0, + "reward": 1.2678571939468384, + "reward_std": 0.35042956471443176, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.8883928656578064, + "rewards/tag_count_reward/std": 0.25380611419677734, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1117.5960693359375, + "completions/mean_terminated_length": 843.3150024414062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.18432688721964732, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1408078647826975, + "kl": 0.01519775390625, + "learning_rate": 9.808079624999634e-07, + "loss": 0.0936, + "num_tokens": 540852659.0, + "reward": 1.3186384439468384, + "reward_std": 0.37147876620292664, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9168526530265808, + "rewards/tag_count_reward/std": 0.23275060951709747, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1048.404052734375, + "completions/mean_terminated_length": 853.8159790039062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.18453998188695328, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1489224783461349, + "kl": 0.018402099609375, + "learning_rate": 9.807111202567919e-07, + "loss": 0.0713, + "num_tokens": 541389304.0, + "reward": 1.4804688692092896, + "reward_std": 0.29375186562538147, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9135044813156128, + "rewards/tag_count_reward/std": 0.22353574633598328, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1069.7857666015625, + "completions/mean_terminated_length": 885.5596923828125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.18475307655425924, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12622876198319827, + "kl": 0.0172882080078125, + "learning_rate": 9.806140396479834e-07, + "loss": 0.0742, + "num_tokens": 541944744.0, + "reward": 1.4202009439468384, + "reward_std": 0.4015435576438904, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.21197210252285004, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 994.6495971679688, + "completions/mean_terminated_length": 786.232666015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18496617122156517, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14702530850785664, + "kl": 0.0198974609375, + "learning_rate": 9.805167207272647e-07, + "loss": 0.1048, + "num_tokens": 542459355.0, + "reward": 1.4988839626312256, + "reward_std": 0.38961467146873474, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8917410969734192, + "rewards/tag_count_reward/std": 0.24348074197769165, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1195.2076416015625, + "completions/mean_terminated_length": 900.69970703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18517926588887113, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12416011425409289, + "kl": 0.018829345703125, + "learning_rate": 9.804191635484942e-07, + "loss": 0.0916, + "num_tokens": 543065592.0, + "reward": 1.3364956378936768, + "reward_std": 0.37175387144088745, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8989955186843872, + "rewards/tag_count_reward/std": 0.23107106983661652, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1156.3773193359375, + "completions/mean_terminated_length": 922.7971801757812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18539236055617708, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1269630574523989, + "kl": 0.0179443359375, + "learning_rate": 9.803213681656627e-07, + "loss": 0.0783, + "num_tokens": 543658065.0, + "reward": 1.3297991752624512, + "reward_std": 0.35142168402671814, + "rewards/accuracy_reward/mean": 0.43518519401550293, + "rewards/accuracy_reward/std": 0.4963560700416565, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.23024296760559082, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1067.83935546875, + "completions/mean_terminated_length": 857.9945678710938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18560545522348304, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13005272770798165, + "kl": 0.01690673828125, + "learning_rate": 9.80223334632892e-07, + "loss": 0.0857, + "num_tokens": 544206473.0, + "reward": 1.4743304252624512, + "reward_std": 0.4461269676685333, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.23583374917507172, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 951.2545166015625, + "completions/mean_terminated_length": 778.3824462890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18581854989078897, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14486847202469041, + "kl": 0.0208740234375, + "learning_rate": 9.801250630044362e-07, + "loss": 0.0776, + "num_tokens": 544700619.0, + "reward": 1.462053656578064, + "reward_std": 0.3894461393356323, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.20557457208633423, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 943.1295166015625, + "completions/mean_terminated_length": 788.5038452148438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.18603164455809493, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12596267179736972, + "kl": 0.0201416015625, + "learning_rate": 9.800265533346816e-07, + "loss": 0.0871, + "num_tokens": 545190581.0, + "reward": 1.5145089626312256, + "reward_std": 0.2981497049331665, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.18416117131710052, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1070.38623046875, + "completions/mean_terminated_length": 867.4851684570312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18624473922540088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12821212235338597, + "kl": 0.017913818359375, + "learning_rate": 9.799278056781453e-07, + "loss": 0.0893, + "num_tokens": 545739970.0, + "reward": 1.4218751192092896, + "reward_std": 0.37049028277397156, + "rewards/accuracy_reward/mean": 0.5462962985038757, + "rewards/accuracy_reward/std": 0.49842923879623413, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8950892686843872, + "rewards/tag_count_reward/std": 0.24437379837036133, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1072.1473388671875, + "completions/mean_terminated_length": 850.2410888671875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18645783389270684, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1297002027899391, + "kl": 0.017608642578125, + "learning_rate": 9.798288200894768e-07, + "loss": 0.0828, + "num_tokens": 546282980.0, + "reward": 1.4960938692092896, + "reward_std": 0.3387187719345093, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.21891970932483673, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 916.1875610351562, + "completions/mean_terminated_length": 764.3240356445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18667092856001277, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12635301300063642, + "kl": 0.01837158203125, + "learning_rate": 9.79729596623457e-07, + "loss": 0.0404, + "num_tokens": 546763064.0, + "reward": 1.5172991752624512, + "reward_std": 0.3567131757736206, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.494759202003479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.17069678008556366, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1070.44873046875, + "completions/mean_terminated_length": 857.9375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.18688402322731873, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1408906467106944, + "kl": 0.019439697265625, + "learning_rate": 9.796301353349984e-07, + "loss": 0.158, + "num_tokens": 547308433.0, + "reward": 1.2896206378936768, + "reward_std": 0.41165655851364136, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8989955186843872, + "rewards/tag_count_reward/std": 0.24630171060562134, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1160.93310546875, + "completions/mean_terminated_length": 962.1912231445312, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.1870971178946247, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11879042075744604, + "kl": 0.016815185546875, + "learning_rate": 9.795304362791454e-07, + "loss": 0.0542, + "num_tokens": 547902819.0, + "reward": 1.4224331378936768, + "reward_std": 0.40214285254478455, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8911830186843872, + "rewards/tag_count_reward/std": 0.2394656240940094, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1106.8192138671875, + "completions/mean_terminated_length": 836.3649291992188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.18731021256193064, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11689074486762376, + "kl": 0.015960693359375, + "learning_rate": 9.794304995110735e-07, + "loss": 0.0869, + "num_tokens": 548478498.0, + "reward": 1.3314732313156128, + "reward_std": 0.3834100663661957, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.24327555298805237, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 940.9933471679688, + "completions/mean_terminated_length": 721.9598999023438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1875233072292366, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1790509872819405, + "kl": 0.02001953125, + "learning_rate": 9.793303250860904e-07, + "loss": 0.1795, + "num_tokens": 548972655.0, + "reward": 1.4882813692092896, + "reward_std": 0.3501301109790802, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9168526530265808, + "rewards/tag_count_reward/std": 0.22040872275829315, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1010.1138916015625, + "completions/mean_terminated_length": 781.0435791015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18773640189654253, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13705385111251822, + "kl": 0.0194091796875, + "learning_rate": 9.792299130596346e-07, + "loss": 0.082, + "num_tokens": 549487058.0, + "reward": 1.4129464626312256, + "reward_std": 0.3497004806995392, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.20586584508419037, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 967.5491333007812, + "completions/mean_terminated_length": 819.4669799804688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1879494965638485, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.4321274233585489, + "kl": 0.05194091796875, + "learning_rate": 9.791292634872767e-07, + "loss": 0.1322, + "num_tokens": 549982408.0, + "reward": 1.4748884439468384, + "reward_std": 0.37381643056869507, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9034598469734192, + "rewards/tag_count_reward/std": 0.22689488530158997, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1173.1473388671875, + "completions/mean_terminated_length": 901.994140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18816259123115445, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1228861490247945, + "kl": 0.01837158203125, + "learning_rate": 9.790283764247187e-07, + "loss": 0.0979, + "num_tokens": 550570858.0, + "reward": 1.3727679252624512, + "reward_std": 0.39497649669647217, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8549107313156128, + "rewards/tag_count_reward/std": 0.28797370195388794, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1064.638427734375, + "completions/mean_terminated_length": 894.2099609375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1883756858984604, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14522509556413055, + "kl": 0.022125244140625, + "learning_rate": 9.789272519277936e-07, + "loss": 0.0957, + "num_tokens": 551118744.0, + "reward": 1.4983259439468384, + "reward_std": 0.3826301693916321, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9045758843421936, + "rewards/tag_count_reward/std": 0.23343613743782043, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1117.4710693359375, + "completions/mean_terminated_length": 832.6151733398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18858878056576633, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1268958739566984, + "kl": 0.0168304443359375, + "learning_rate": 9.78825890052466e-07, + "loss": 0.1229, + "num_tokens": 551695339.0, + "reward": 1.3353794813156128, + "reward_std": 0.41855767369270325, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8666294813156128, + "rewards/tag_count_reward/std": 0.27157923579216003, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1146.274658203125, + "completions/mean_terminated_length": 873.6598510742188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1888018752330723, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1197987332931469, + "kl": 0.016204833984375, + "learning_rate": 9.787242908548323e-07, + "loss": 0.0902, + "num_tokens": 552279366.0, + "reward": 1.4246652126312256, + "reward_std": 0.33839982748031616, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8755580186843872, + "rewards/tag_count_reward/std": 0.26546087861061096, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1028.9285888671875, + "completions/mean_terminated_length": 843.3984375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18901496990037825, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14030611238550697, + "kl": 0.018218994140625, + "learning_rate": 9.786224543911195e-07, + "loss": 0.1151, + "num_tokens": 552812454.0, + "reward": 1.5150669813156128, + "reward_std": 0.40816301107406616, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8878348469734192, + "rewards/tag_count_reward/std": 0.2384992092847824, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1050.1741943359375, + "completions/mean_terminated_length": 816.5234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1892280645676842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12866111395369745, + "kl": 0.01824951171875, + "learning_rate": 9.785203807176864e-07, + "loss": 0.1062, + "num_tokens": 553347540.0, + "reward": 1.4676339626312256, + "reward_std": 0.3957492411136627, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9029017686843872, + "rewards/tag_count_reward/std": 0.24360375106334686, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1006.6116333007812, + "completions/mean_terminated_length": 759.2099609375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.18944115923499014, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11797656626089308, + "kl": 0.017852783203125, + "learning_rate": 9.78418069891023e-07, + "loss": 0.0714, + "num_tokens": 553862438.0, + "reward": 1.407366156578064, + "reward_std": 0.33798545598983765, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.22366204857826233, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1233.5535888671875, + "completions/mean_terminated_length": 977.994140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1896542539022961, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11769349785186133, + "kl": 0.0158538818359375, + "learning_rate": 9.783155219677505e-07, + "loss": 0.1184, + "num_tokens": 554486814.0, + "reward": 1.3850446939468384, + "reward_std": 0.4460518956184387, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8582589030265808, + "rewards/tag_count_reward/std": 0.29347658157348633, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 880.93310546875, + "completions/mean_terminated_length": 703.9228515625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.18986734856960205, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15428443063737868, + "kl": 0.0216064453125, + "learning_rate": 9.782127370046216e-07, + "loss": 0.1058, + "num_tokens": 554943648.0, + "reward": 1.5552456378936768, + "reward_std": 0.3522113561630249, + "rewards/accuracy_reward/mean": 0.6666666865348816, + "rewards/accuracy_reward/std": 0.47195106744766235, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9123883843421936, + "rewards/tag_count_reward/std": 0.2286706119775772, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 975.3013916015625, + "completions/mean_terminated_length": 803.0025634765625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.190080443236908, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.712524232452467, + "kl": 0.04974365234375, + "learning_rate": 9.781097150585194e-07, + "loss": 0.0628, + "num_tokens": 555459751.0, + "reward": 1.4107143878936768, + "reward_std": 0.36250707507133484, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9151785969734192, + "rewards/tag_count_reward/std": 0.21494385600090027, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 919.9598388671875, + "completions/mean_terminated_length": 748.868896484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19029353790421394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15002837168625757, + "kl": 0.01947021484375, + "learning_rate": 9.780064561864592e-07, + "loss": 0.1241, + "num_tokens": 555938453.0, + "reward": 1.4559152126312256, + "reward_std": 0.33153530955314636, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.182987242937088, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 914.30810546875, + "completions/mean_terminated_length": 732.21240234375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1905066325715199, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13213480531095445, + "kl": 0.0205078125, + "learning_rate": 9.779029604455863e-07, + "loss": 0.0602, + "num_tokens": 556418751.0, + "reward": 1.4469866752624512, + "reward_std": 0.30496519804000854, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995505809783936, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.19061920046806335, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 960.4910888671875, + "completions/mean_terminated_length": 811.4415893554688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19071972723882585, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13350655927455168, + "kl": 0.021453857421875, + "learning_rate": 9.777992278931783e-07, + "loss": 0.0596, + "num_tokens": 556918571.0, + "reward": 1.4715402126312256, + "reward_std": 0.31978631019592285, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.18212558329105377, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1126.65185546875, + "completions/mean_terminated_length": 888.5505981445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1909328219061318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15110988658421268, + "kl": 0.01708984375, + "learning_rate": 9.776952585866427e-07, + "loss": 0.1496, + "num_tokens": 557491199.0, + "reward": 1.3325893878936768, + "reward_std": 0.38315898180007935, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8883928656578064, + "rewards/tag_count_reward/std": 0.24879863858222961, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 997.513427734375, + "completions/mean_terminated_length": 786.28955078125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.19114591657343774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13878204615663806, + "kl": 0.022125244140625, + "learning_rate": 9.775910525835188e-07, + "loss": 0.0717, + "num_tokens": 558004741.0, + "reward": 1.4888393878936768, + "reward_std": 0.33801183104515076, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8973214030265808, + "rewards/tag_count_reward/std": 0.23720870912075043, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1021.3504638671875, + "completions/mean_terminated_length": 850.2421875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.1913590112407437, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 218.12230754374687, + "kl": 8.798553466796875, + "learning_rate": 9.774866099414765e-07, + "loss": 0.4774, + "num_tokens": 558534578.0, + "reward": 1.559709906578064, + "reward_std": 0.3949979543685913, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9056919813156128, + "rewards/tag_count_reward/std": 0.23508284986019135, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 925.4063110351562, + "completions/mean_terminated_length": 731.4502563476562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19157210590804966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15162419257587623, + "kl": 0.01983642578125, + "learning_rate": 9.773819307183168e-07, + "loss": 0.0825, + "num_tokens": 559017704.0, + "reward": 1.4704241752624512, + "reward_std": 0.3757508397102356, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.2062104493379593, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 983.263427734375, + "completions/mean_terminated_length": 792.7316284179688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1917852005753556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.070366262144076, + "kl": 0.03057861328125, + "learning_rate": 9.77277014971972e-07, + "loss": 0.0748, + "num_tokens": 559532430.0, + "reward": 1.481584906578064, + "reward_std": 0.36257320642471313, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19802019000053406, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 985.65185546875, + "completions/mean_terminated_length": 827.6615600585938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19199829524266154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1426730344604902, + "kl": 0.019012451171875, + "learning_rate": 9.771718627605047e-07, + "loss": 0.0807, + "num_tokens": 560046370.0, + "reward": 1.3577009439468384, + "reward_std": 0.4410659372806549, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9157366156578064, + "rewards/tag_count_reward/std": 0.22125105559825897, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1115.6138916015625, + "completions/mean_terminated_length": 887.697265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1922113899099675, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13223184006332392, + "kl": 0.018707275390625, + "learning_rate": 9.770664741421085e-07, + "loss": 0.0663, + "num_tokens": 560612837.0, + "reward": 1.2935268878936768, + "reward_std": 0.39318031072616577, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8917410969734192, + "rewards/tag_count_reward/std": 0.24405431747436523, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1065.84375, + "completions/mean_terminated_length": 868.3592529296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19242448457727346, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13569956211093284, + "kl": 0.01776123046875, + "learning_rate": 9.769608491751079e-07, + "loss": 0.0955, + "num_tokens": 561161311.0, + "reward": 1.5407366752624512, + "reward_std": 0.31699052453041077, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.22521603107452393, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 949.5692138671875, + "completions/mean_terminated_length": 782.9691772460938, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.19263757924457942, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12752665561561735, + "kl": 0.01953125, + "learning_rate": 9.768549879179584e-07, + "loss": 0.0799, + "num_tokens": 561655502.0, + "reward": 1.4860491752624512, + "reward_std": 0.2688225209712982, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.21122872829437256, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 981.9933471679688, + "completions/mean_terminated_length": 820.31103515625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.19285067391188535, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12397150682089524, + "kl": 0.019775390625, + "learning_rate": 9.76748890429246e-07, + "loss": 0.0794, + "num_tokens": 562163355.0, + "reward": 1.4168527126312256, + "reward_std": 0.3452180325984955, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.19683773815631866, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1036.243408203125, + "completions/mean_terminated_length": 852.0448608398438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1930637685791913, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12215324487839498, + "kl": 0.02081298828125, + "learning_rate": 9.766425567676879e-07, + "loss": 0.0719, + "num_tokens": 562688680.0, + "reward": 1.520647406578064, + "reward_std": 0.3321880102157593, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.19764786958694458, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 1062.8125, + "completions/mean_terminated_length": 818.5737915039062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.19327686324649726, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12175350614508576, + "kl": 0.020294189453125, + "learning_rate": 9.76535986992131e-07, + "loss": 0.0771, + "num_tokens": 563235780.0, + "reward": 1.4196429252624512, + "reward_std": 0.34208735823631287, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9196428656578064, + "rewards/tag_count_reward/std": 0.2173006236553192, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 990.7678833007812, + "completions/mean_terminated_length": 848.911376953125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.19348995791380322, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12556871762295957, + "kl": 0.0185546875, + "learning_rate": 9.76429181161554e-07, + "loss": 0.0728, + "num_tokens": 563744556.0, + "reward": 1.4676339626312256, + "reward_std": 0.3798133134841919, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.21333187818527222, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1109.10498046875, + "completions/mean_terminated_length": 941.0921630859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19370305258110915, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11917942891502653, + "kl": 0.016265869140625, + "learning_rate": 9.763221393350655e-07, + "loss": 0.072, + "num_tokens": 564316923.0, + "reward": 1.2885044813156128, + "reward_std": 0.38661423325538635, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.22735659778118134, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 950.7120971679688, + "completions/mean_terminated_length": 754.3552856445312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1939161472484151, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14304975621724847, + "kl": 0.019500732421875, + "learning_rate": 9.76214861571905e-07, + "loss": 0.1274, + "num_tokens": 564820714.0, + "reward": 1.4609376192092896, + "reward_std": 0.3816933035850525, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.21748150885105133, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 982.1250610351562, + "completions/mean_terminated_length": 778.021240234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19412924191572106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14330406809622806, + "kl": 0.018280029296875, + "learning_rate": 9.761073479314429e-07, + "loss": 0.1053, + "num_tokens": 565329522.0, + "reward": 1.4648438692092896, + "reward_std": 0.4017872214317322, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9001116156578064, + "rewards/tag_count_reward/std": 0.24218197166919708, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 936.9129638671875, + "completions/mean_terminated_length": 771.6743774414062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19434233658302702, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13802183683128744, + "kl": 0.019317626953125, + "learning_rate": 9.759995984731792e-07, + "loss": 0.1062, + "num_tokens": 565816235.0, + "reward": 1.4246652126312256, + "reward_std": 0.34561508893966675, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.2086479365825653, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1087.4754638671875, + "completions/mean_terminated_length": 865.8159790039062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.19455543125033295, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12723289260065085, + "kl": 0.018402099609375, + "learning_rate": 9.758916132567452e-07, + "loss": 0.0986, + "num_tokens": 566381024.0, + "reward": 1.3805804252624512, + "reward_std": 0.39436572790145874, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8783482313156128, + "rewards/tag_count_reward/std": 0.2643847167491913, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 1082.4710693359375, + "completions/mean_terminated_length": 859.6566162109375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1947685259176389, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10964048619791143, + "kl": 0.0162811279296875, + "learning_rate": 9.757833923419027e-07, + "loss": 0.0478, + "num_tokens": 566941059.0, + "reward": 1.3856027126312256, + "reward_std": 0.38387662172317505, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19082863628864288, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1084.04248046875, + "completions/mean_terminated_length": 864.8411254882812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19498162058494486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23455995557445197, + "kl": 0.023468017578125, + "learning_rate": 9.756749357885433e-07, + "loss": 0.0575, + "num_tokens": 567491926.0, + "reward": 1.3934152126312256, + "reward_std": 0.3877316117286682, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8844866156578064, + "rewards/tag_count_reward/std": 0.25672033429145813, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 967.185302734375, + "completions/mean_terminated_length": 803.257080078125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.19519471525225082, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11767722105789222, + "kl": 0.01898193359375, + "learning_rate": 9.755662436566897e-07, + "loss": 0.0508, + "num_tokens": 567994185.0, + "reward": 1.4045759439468384, + "reward_std": 0.3792615830898285, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.20777854323387146, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 944.1295166015625, + "completions/mean_terminated_length": 776.704345703125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.19540780991955675, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14836465821152875, + "kl": 0.019439697265625, + "learning_rate": 9.754573160064944e-07, + "loss": 0.0885, + "num_tokens": 568483139.0, + "reward": 1.5139509439468384, + "reward_std": 0.35800066590309143, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.20454494655132294, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 924.7879638671875, + "completions/mean_terminated_length": 774.0784912109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1956209045868627, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13832063026854555, + "kl": 0.021636962890625, + "learning_rate": 9.753481528982407e-07, + "loss": 0.0423, + "num_tokens": 568970276.0, + "reward": 1.4263393878936768, + "reward_std": 0.3221184313297272, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8950892686843872, + "rewards/tag_count_reward/std": 0.2296247035264969, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 964.40185546875, + "completions/mean_terminated_length": 773.8477783203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19583399925416867, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13684636074427955, + "kl": 0.020751953125, + "learning_rate": 9.75238754392342e-07, + "loss": 0.0959, + "num_tokens": 569467960.0, + "reward": 1.485491156578064, + "reward_std": 0.3128793239593506, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.18356364965438843, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1100.875, + "completions/mean_terminated_length": 907.3763427734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19604709392147462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12732427143682615, + "kl": 0.0170135498046875, + "learning_rate": 9.751291205493421e-07, + "loss": 0.1193, + "num_tokens": 570034688.0, + "reward": 1.2712054252624512, + "reward_std": 0.4475001096725464, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8805803656578064, + "rewards/tag_count_reward/std": 0.24965769052505493, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 970.2723388671875, + "completions/mean_terminated_length": 806.8123168945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19626018858878055, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1363261609007378, + "kl": 0.019195556640625, + "learning_rate": 9.750192514299148e-07, + "loss": 0.1066, + "num_tokens": 570543770.0, + "reward": 1.4179688692092896, + "reward_std": 0.32332125306129456, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9135044813156128, + "rewards/tag_count_reward/std": 0.2093229591846466, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 978.7031860351562, + "completions/mean_terminated_length": 803.7272338867188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.1964732832560865, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6075809438990477, + "kl": 0.02862548828125, + "learning_rate": 9.749091470948643e-07, + "loss": 0.0856, + "num_tokens": 571047877.0, + "reward": 1.4051339626312256, + "reward_std": 0.41178596019744873, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9051339030265808, + "rewards/tag_count_reward/std": 0.2339629977941513, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1040.415283203125, + "completions/mean_terminated_length": 850.6578369140625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.19668637792339247, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13129730402658027, + "kl": 0.020111083984375, + "learning_rate": 9.74798807605125e-07, + "loss": 0.1109, + "num_tokens": 571580575.0, + "reward": 1.419084906578064, + "reward_std": 0.3938526213169098, + "rewards/accuracy_reward/mean": 0.5462962985038757, + "rewards/accuracy_reward/std": 0.4984292685985565, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8922991156578064, + "rewards/tag_count_reward/std": 0.24171243607997894, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1082.2210693359375, + "completions/mean_terminated_length": 808.2608032226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19689947259069843, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11678656002404714, + "kl": 0.0174560546875, + "learning_rate": 9.74688233021761e-07, + "loss": 0.0938, + "num_tokens": 572135458.0, + "reward": 1.3705357313156128, + "reward_std": 0.32974135875701904, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549686908722, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8950892686843872, + "rewards/tag_count_reward/std": 0.2460842877626419, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 945.3906860351562, + "completions/mean_terminated_length": 737.7373657226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19711256725800436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1459764898162939, + "kl": 0.02191162109375, + "learning_rate": 9.745774234059673e-07, + "loss": 0.0963, + "num_tokens": 572631121.0, + "reward": 1.360491156578064, + "reward_std": 0.3288823068141937, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.2594851851463318, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1054.560302734375, + "completions/mean_terminated_length": 815.14404296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19732566192531031, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14028015448548256, + "kl": 0.017425537109375, + "learning_rate": 9.744663788190685e-07, + "loss": 0.0873, + "num_tokens": 573169756.0, + "reward": 1.3805804252624512, + "reward_std": 0.3514879047870636, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8917410969734192, + "rewards/tag_count_reward/std": 0.24462655186653137, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1018.1317138671875, + "completions/mean_terminated_length": 814.3609619140625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.19753875659261627, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.6730480984492418, + "kl": 0.026824951171875, + "learning_rate": 9.743550993225188e-07, + "loss": 0.1333, + "num_tokens": 573691767.0, + "reward": 1.399553656578064, + "reward_std": 0.3845144212245941, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8861607313156128, + "rewards/tag_count_reward/std": 0.24265125393867493, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 994.9285888671875, + "completions/mean_terminated_length": 786.5668334960938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19775185125992223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4178950857908417, + "kl": 0.136962890625, + "learning_rate": 9.742435849779036e-07, + "loss": 0.0643, + "num_tokens": 574219575.0, + "reward": 1.356584906578064, + "reward_std": 0.27073487639427185, + "rewards/accuracy_reward/mean": 0.48148149251937866, + "rewards/accuracy_reward/std": 0.5002362728118896, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8922991156578064, + "rewards/tag_count_reward/std": 0.22180332243442535, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1133.930908203125, + "completions/mean_terminated_length": 825.6029663085938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.19796494592722816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13873784544169512, + "kl": 0.018157958984375, + "learning_rate": 9.741318358469371e-07, + "loss": 0.1329, + "num_tokens": 574802744.0, + "reward": 1.2650669813156128, + "reward_std": 0.3683628439903259, + "rewards/accuracy_reward/mean": 0.3973214328289032, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8677455186843872, + "rewards/tag_count_reward/std": 0.26798370480537415, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1056.1875, + "completions/mean_terminated_length": 881.7742919921875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.19817804059453412, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12246915877249327, + "kl": 0.01800537109375, + "learning_rate": 9.740198519914637e-07, + "loss": 0.0594, + "num_tokens": 575345868.0, + "reward": 1.4626116752624512, + "reward_std": 0.35262539982795715, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.19353099167346954, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1155.97998046875, + "completions/mean_terminated_length": 906.2142944335938, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.19839113526184007, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12915501802536825, + "kl": 0.01666259765625, + "learning_rate": 9.739076334734585e-07, + "loss": 0.0976, + "num_tokens": 575937891.0, + "reward": 1.2767857313156128, + "reward_std": 0.4358396828174591, + "rewards/accuracy_reward/mean": 0.4447115361690521, + "rewards/accuracy_reward/std": 0.4975321590900421, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8638392686843872, + "rewards/tag_count_reward/std": 0.28406286239624023, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1091.430908203125, + "completions/mean_terminated_length": 899.0911865234375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.19860422992914603, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12984489417640183, + "kl": 0.019134521484375, + "learning_rate": 9.737951803550256e-07, + "loss": 0.0688, + "num_tokens": 576489092.0, + "reward": 1.3236607313156128, + "reward_std": 0.42935460805892944, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.26121383905410767, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 954.6406860351562, + "completions/mean_terminated_length": 755.5857543945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19881732459645196, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.4650473472878477, + "kl": 0.04248046875, + "learning_rate": 9.73682492698399e-07, + "loss": 0.0613, + "num_tokens": 576989411.0, + "reward": 1.380022406578064, + "reward_std": 0.34238237142562866, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8956473469734192, + "rewards/tag_count_reward/std": 0.24317994713783264, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 953.0826416015625, + "completions/mean_terminated_length": 799.849853515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19903041926375792, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13306694495655733, + "kl": 0.019744873046875, + "learning_rate": 9.735695705659428e-07, + "loss": 0.0816, + "num_tokens": 577484504.0, + "reward": 1.4916294813156128, + "reward_std": 0.3948051631450653, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.21563898026943207, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1036.950927734375, + "completions/mean_terminated_length": 840.13330078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19924351393106388, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12593073542131225, + "kl": 0.018402099609375, + "learning_rate": 9.734564140201506e-07, + "loss": 0.0807, + "num_tokens": 578014226.0, + "reward": 1.430803656578064, + "reward_std": 0.4254690706729889, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9151785969734192, + "rewards/tag_count_reward/std": 0.2155933827161789, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 966.7813110351562, + "completions/mean_terminated_length": 745.8870849609375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.19945660859836983, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1410035352138314, + "kl": 0.017333984375, + "learning_rate": 9.733430231236462e-07, + "loss": 0.1123, + "num_tokens": 578517568.0, + "reward": 1.3939732313156128, + "reward_std": 0.3960471451282501, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.26002347469329834, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1029.1607666015625, + "completions/mean_terminated_length": 853.1309204101562, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.19966970326567576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12785298909646056, + "kl": 0.017852783203125, + "learning_rate": 9.732293979391826e-07, + "loss": 0.0969, + "num_tokens": 579045784.0, + "reward": 1.4536831378936768, + "reward_std": 0.4685271382331848, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.8733258843421936, + "rewards/tag_count_reward/std": 0.2606726288795471, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1041.024658203125, + "completions/mean_terminated_length": 794.875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.19988279793298172, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.3709747368676183, + "kl": 0.03118896484375, + "learning_rate": 9.731155385296428e-07, + "loss": 0.0539, + "num_tokens": 579585123.0, + "reward": 1.3705357313156128, + "reward_std": 0.4214267432689667, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.27125784754753113, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1168.3304443359375, + "completions/mean_terminated_length": 875.107177734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20009589260028768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11865719025963911, + "kl": 0.016998291015625, + "learning_rate": 9.730014449580391e-07, + "loss": 0.0753, + "num_tokens": 580181543.0, + "reward": 1.2739956378936768, + "reward_std": 0.47678273916244507, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8565848469734192, + "rewards/tag_count_reward/std": 0.27262529730796814, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 974.0826416015625, + "completions/mean_terminated_length": 754.6801147460938, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.20030898726759364, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12893348555157633, + "kl": 0.018341064453125, + "learning_rate": 9.728871172875137e-07, + "loss": 0.0516, + "num_tokens": 580685884.0, + "reward": 1.376116156578064, + "reward_std": 0.35923895239830017, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.2523038983345032, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1069.0023193359375, + "completions/mean_terminated_length": 829.6917114257812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.20052208193489957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1423227322672156, + "kl": 0.020416259765625, + "learning_rate": 9.727725555813383e-07, + "loss": 0.085, + "num_tokens": 581233565.0, + "reward": 1.30859375, + "reward_std": 0.4114075303077698, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8777901530265808, + "rewards/tag_count_reward/std": 0.2643912136554718, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1026.638427734375, + "completions/mean_terminated_length": 790.9395751953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20073517660220552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15731581995196475, + "kl": 0.018951416015625, + "learning_rate": 9.726577599029134e-07, + "loss": 0.1021, + "num_tokens": 581766891.0, + "reward": 1.4207589626312256, + "reward_std": 0.43133798241615295, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.251193106174469, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1126.0023193359375, + "completions/mean_terminated_length": 836.6950073242188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20094827126951148, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12358701980067863, + "kl": 0.016143798828125, + "learning_rate": 9.725427303157703e-07, + "loss": 0.0376, + "num_tokens": 582337580.0, + "reward": 1.3141741752624512, + "reward_std": 0.4282354414463043, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8699776530265808, + "rewards/tag_count_reward/std": 0.26223400235176086, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1195.794677734375, + "completions/mean_terminated_length": 904.9221801757812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20116136593681744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11546037429775202, + "kl": 0.014984130859375, + "learning_rate": 9.72427466883569e-07, + "loss": 0.0923, + "num_tokens": 582949344.0, + "reward": 1.2287946939468384, + "reward_std": 0.4299945831298828, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.295629620552063, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1017.6406860351562, + "completions/mean_terminated_length": 793.6494750976562, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.2013744606041234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12424074605822494, + "kl": 0.01788330078125, + "learning_rate": 9.723119696700987e-07, + "loss": 0.0703, + "num_tokens": 583470175.0, + "reward": 1.4419643878936768, + "reward_std": 0.3329524099826813, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8683035969734192, + "rewards/tag_count_reward/std": 0.26590317487716675, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1114.732177734375, + "completions/mean_terminated_length": 829.0379028320312, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.20158755527142933, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13189317693351543, + "kl": 0.017059326171875, + "learning_rate": 9.721962387392784e-07, + "loss": 0.1039, + "num_tokens": 584038135.0, + "reward": 1.2779018878936768, + "reward_std": 0.3820245862007141, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8448660969734192, + "rewards/tag_count_reward/std": 0.28168630599975586, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1045.0023193359375, + "completions/mean_terminated_length": 830.268310546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.20180064993873528, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.60896568534435, + "kl": 0.195281982421875, + "learning_rate": 9.720802741551565e-07, + "loss": 0.1206, + "num_tokens": 584572952.0, + "reward": 1.5000001192092896, + "reward_std": 0.4759651720523834, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8660714030265808, + "rewards/tag_count_reward/std": 0.26159584522247314, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1046.2879638671875, + "completions/mean_terminated_length": 808.3121948242188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20201374460604124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1278418720203623, + "kl": 0.0176239013671875, + "learning_rate": 9.7196407598191e-07, + "loss": 0.1014, + "num_tokens": 585113065.0, + "reward": 1.4313616752624512, + "reward_std": 0.4454249441623688, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8599330186843872, + "rewards/tag_count_reward/std": 0.2794141173362732, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1114.1160888671875, + "completions/mean_terminated_length": 776.3282470703125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2022268392733472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12959005263771578, + "kl": 0.01763916015625, + "learning_rate": 9.718476442838464e-07, + "loss": 0.0878, + "num_tokens": 585682429.0, + "reward": 1.3325893878936768, + "reward_std": 0.3996261954307556, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8482142686843872, + "rewards/tag_count_reward/std": 0.2835085690021515, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1121.384033203125, + "completions/mean_terminated_length": 878.6365966796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20243993394065313, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12983375769191466, + "kl": 0.0159912109375, + "learning_rate": 9.71730979125401e-07, + "loss": 0.0948, + "num_tokens": 586257257.0, + "reward": 1.258928656578064, + "reward_std": 0.4123886823654175, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8348214030265808, + "rewards/tag_count_reward/std": 0.29877373576164246, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1118.1160888671875, + "completions/mean_terminated_length": 897.2044677734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20265302860795908, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12899402720112513, + "kl": 0.02099609375, + "learning_rate": 9.7161408057114e-07, + "loss": 0.0834, + "num_tokens": 586826573.0, + "reward": 1.426897406578064, + "reward_std": 0.406277596950531, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8643973469734192, + "rewards/tag_count_reward/std": 0.26472151279449463, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 990.9308471679688, + "completions/mean_terminated_length": 754.10107421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20286612327526504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13524494729078168, + "kl": 0.02020263671875, + "learning_rate": 9.714969486857567e-07, + "loss": 0.0669, + "num_tokens": 587334478.0, + "reward": 1.4626116752624512, + "reward_std": 0.41052141785621643, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.2727077007293701, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1147.2701416015625, + "completions/mean_terminated_length": 951.459228515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.203079217942571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11986755276536806, + "kl": 0.0160369873046875, + "learning_rate": 9.713795835340753e-07, + "loss": 0.0699, + "num_tokens": 587922439.0, + "reward": 1.4765626192092896, + "reward_std": 0.44435131549835205, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8627232313156128, + "rewards/tag_count_reward/std": 0.27348312735557556, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1094.28125, + "completions/mean_terminated_length": 861.1500244140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20329231260987693, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11733210974900807, + "kl": 0.0179595947265625, + "learning_rate": 9.712619851810482e-07, + "loss": 0.0794, + "num_tokens": 588480805.0, + "reward": 1.4402902126312256, + "reward_std": 0.38445407152175903, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.2626193165779114, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1032.18310546875, + "completions/mean_terminated_length": 840.8753051757812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2035054072771829, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14104131828314123, + "kl": 0.0194091796875, + "learning_rate": 9.711441536917573e-07, + "loss": 0.049, + "num_tokens": 589009511.0, + "reward": 1.422991156578064, + "reward_std": 0.4051731824874878, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8761160969734192, + "rewards/tag_count_reward/std": 0.2525017559528351, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1006.1094360351562, + "completions/mean_terminated_length": 758.5884399414062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.20371850194448884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14643915796263005, + "kl": 0.019134521484375, + "learning_rate": 9.710260891314131e-07, + "loss": 0.1108, + "num_tokens": 589528568.0, + "reward": 1.4626116752624512, + "reward_std": 0.39833319187164307, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8889508843421936, + "rewards/tag_count_reward/std": 0.2425065040588379, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1052.4910888671875, + "completions/mean_terminated_length": 849.1075439453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2039315966117948, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13649285890991408, + "kl": 0.0186767578125, + "learning_rate": 9.709077915653552e-07, + "loss": 0.1267, + "num_tokens": 590075956.0, + "reward": 1.4419643878936768, + "reward_std": 0.37415704131126404, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8638392686843872, + "rewards/tag_count_reward/std": 0.2709631323814392, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1075.071533203125, + "completions/mean_terminated_length": 816.72314453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20414469127910073, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13186539270898398, + "kl": 0.017486572265625, + "learning_rate": 9.707892610590526e-07, + "loss": 0.132, + "num_tokens": 590623316.0, + "reward": 1.3844866752624512, + "reward_std": 0.4485476613044739, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8777901530265808, + "rewards/tag_count_reward/std": 0.2574244737625122, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1113.32373046875, + "completions/mean_terminated_length": 878.34912109375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2043577859464067, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10905023303057652, + "kl": 0.0164947509765625, + "learning_rate": 9.70670497678103e-07, + "loss": -0.0068, + "num_tokens": 591186837.0, + "reward": 1.3973214626312256, + "reward_std": 0.30831965804100037, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.21637944877147675, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 982.950927734375, + "completions/mean_terminated_length": 765.3602294921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20457088061371265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1307998086008599, + "kl": 0.01953125, + "learning_rate": 9.70551501488232e-07, + "loss": 0.1472, + "num_tokens": 591694015.0, + "reward": 1.4631696939468384, + "reward_std": 0.3534601926803589, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.2444223314523697, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 1060.77685546875, + "completions/mean_terminated_length": 819.45556640625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2047839752810186, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13679705701679346, + "kl": 0.0171051025390625, + "learning_rate": 9.704322725552956e-07, + "loss": 0.1244, + "num_tokens": 592242475.0, + "reward": 1.3303571939468384, + "reward_std": 0.34770551323890686, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8883928656578064, + "rewards/tag_count_reward/std": 0.24710693955421448, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 971.0826416015625, + "completions/mean_terminated_length": 747.5714111328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20499706994832453, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.2458819804723144, + "kl": 0.026763916015625, + "learning_rate": 9.703128109452775e-07, + "loss": 0.125, + "num_tokens": 592753440.0, + "reward": 1.4520089626312256, + "reward_std": 0.326716810464859, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8783482313156128, + "rewards/tag_count_reward/std": 0.2590422034263611, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 981.7076416015625, + "completions/mean_terminated_length": 780.8938598632812, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2052101646156305, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.670081992700361, + "kl": 0.018829345703125, + "learning_rate": 9.70193116724291e-07, + "loss": 0.0791, + "num_tokens": 593264813.0, + "reward": 1.422991156578064, + "reward_std": 0.3601451814174652, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8783482313156128, + "rewards/tag_count_reward/std": 0.25468748807907104, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1033.321533203125, + "completions/mean_terminated_length": 809.373291015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20542325928293645, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.135578193296156, + "kl": 0.018798828125, + "learning_rate": 9.700731899585773e-07, + "loss": 0.083, + "num_tokens": 593799933.0, + "reward": 1.387834906578064, + "reward_std": 0.3177962899208069, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.1878284513950348, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 988.8683471679688, + "completions/mean_terminated_length": 818.7486572265625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.2056363539502424, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13332120652966487, + "kl": 0.018829345703125, + "learning_rate": 9.699530307145067e-07, + "loss": 0.0661, + "num_tokens": 594313154.0, + "reward": 1.4135044813156128, + "reward_std": 0.36688777804374695, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9135044813156128, + "rewards/tag_count_reward/std": 0.21394765377044678, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1111.7523193359375, + "completions/mean_terminated_length": 859.7875366210938, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.20584944861754834, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12360263314184435, + "kl": 0.016204833984375, + "learning_rate": 9.698326390585784e-07, + "loss": 0.1081, + "num_tokens": 594880867.0, + "reward": 1.3532366752624512, + "reward_std": 0.3957882225513458, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8956473469734192, + "rewards/tag_count_reward/std": 0.24432718753814697, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 977.3594360351562, + "completions/mean_terminated_length": 795.657958984375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2060625432848543, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9560655732420187, + "kl": 0.020111083984375, + "learning_rate": 9.697120150574198e-07, + "loss": 0.1219, + "num_tokens": 595383156.0, + "reward": 1.4056919813156128, + "reward_std": 0.38246452808380127, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9034598469734192, + "rewards/tag_count_reward/std": 0.2231668382883072, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1062.513427734375, + "completions/mean_terminated_length": 848.2771606445312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.20627563795216025, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12946985131841315, + "kl": 0.02044677734375, + "learning_rate": 9.695911587777873e-07, + "loss": 0.0726, + "num_tokens": 595926058.0, + "reward": 1.469866156578064, + "reward_std": 0.43958985805511475, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.25340983271598816, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 1058.5535888671875, + "completions/mean_terminated_length": 820.0996704101562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2064887326194662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13937668542452109, + "kl": 0.019561767578125, + "learning_rate": 9.69470070286565e-07, + "loss": 0.0981, + "num_tokens": 596477010.0, + "reward": 1.4023438692092896, + "reward_std": 0.3652377426624298, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8978794813156128, + "rewards/tag_count_reward/std": 0.2424035370349884, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1063.203125, + "completions/mean_terminated_length": 852.3658447265625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.20670182728677214, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17332028545550354, + "kl": 0.017547607421875, + "learning_rate": 9.693487496507668e-07, + "loss": 0.1219, + "num_tokens": 597021725.0, + "reward": 1.391741156578064, + "reward_std": 0.37701690196990967, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.23568548262119293, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1080.24560546875, + "completions/mean_terminated_length": 819.8016967773438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2069149219540781, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1269790332033462, + "kl": 0.017303466796875, + "learning_rate": 9.692271969375341e-07, + "loss": 0.0978, + "num_tokens": 597572923.0, + "reward": 1.3872768878936768, + "reward_std": 0.44391947984695435, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8738839030265808, + "rewards/tag_count_reward/std": 0.2654591202735901, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1104.375, + "completions/mean_terminated_length": 836.6991577148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20712801662138405, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12195207244859864, + "kl": 0.017547607421875, + "learning_rate": 9.691054122141368e-07, + "loss": 0.0343, + "num_tokens": 598140963.0, + "reward": 1.3632813692092896, + "reward_std": 0.38065415620803833, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.23745527863502502, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1120.25, + "completions/mean_terminated_length": 890.2506713867188, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.20734111128869, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11576084163125788, + "kl": 0.01727294921875, + "learning_rate": 9.689833955479737e-07, + "loss": 0.1008, + "num_tokens": 598709891.0, + "reward": 1.481584906578064, + "reward_std": 0.34943288564682007, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.24384641647338867, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1031.044677734375, + "completions/mean_terminated_length": 771.8207397460938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20755420595599594, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1299264245472782, + "kl": 0.02093505859375, + "learning_rate": 9.688611470065716e-07, + "loss": 0.1242, + "num_tokens": 599236951.0, + "reward": 1.4888393878936768, + "reward_std": 0.428301066160202, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.2481052279472351, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1149.2410888671875, + "completions/mean_terminated_length": 890.9769897460938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2077673006233019, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 1.0732657490736828, + "kl": 0.0911865234375, + "learning_rate": 9.687386666575858e-07, + "loss": 0.108, + "num_tokens": 599824643.0, + "reward": 1.3844866752624512, + "reward_std": 0.4030228555202484, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8376116156578064, + "rewards/tag_count_reward/std": 0.3123975992202759, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1089.33935546875, + "completions/mean_terminated_length": 827.8864135742188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.20798039529060786, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13010300635074162, + "kl": 0.018310546875, + "learning_rate": 9.686159545687996e-07, + "loss": 0.1124, + "num_tokens": 600384011.0, + "reward": 1.2918527126312256, + "reward_std": 0.366824746131897, + "rewards/accuracy_reward/mean": 0.3973214328289032, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.2403814047574997, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1138.493408203125, + "completions/mean_terminated_length": 909.8463134765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2081934899579138, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11307137668872197, + "kl": 0.017059326171875, + "learning_rate": 9.684930108081249e-07, + "loss": 0.0679, + "num_tokens": 600965336.0, + "reward": 1.3867188692092896, + "reward_std": 0.3642430603504181, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.2267407774925232, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1137.055908203125, + "completions/mean_terminated_length": 861.654052734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20840658462521974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1240092416090791, + "kl": 0.016754150390625, + "learning_rate": 9.683698354436016e-07, + "loss": 0.1253, + "num_tokens": 601545697.0, + "reward": 1.3727679252624512, + "reward_std": 0.39141160249710083, + "rewards/accuracy_reward/mean": 0.49074074625968933, + "rewards/accuracy_reward/std": 0.5004938244819641, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8995535969734192, + "rewards/tag_count_reward/std": 0.24453723430633545, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1168.044677734375, + "completions/mean_terminated_length": 905.3333740234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2086196792925257, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11952483737327563, + "kl": 0.0171661376953125, + "learning_rate": 9.682464285433979e-07, + "loss": 0.112, + "num_tokens": 602139285.0, + "reward": 1.3108259439468384, + "reward_std": 0.3328397274017334, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8844866156578064, + "rewards/tag_count_reward/std": 0.2631748914718628, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1042.046875, + "completions/mean_terminated_length": 820.0245361328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20883277395983166, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1375594131993578, + "kl": 0.01861572265625, + "learning_rate": 9.681227901758101e-07, + "loss": 0.1376, + "num_tokens": 602675402.0, + "reward": 1.4280134439468384, + "reward_std": 0.36848947405815125, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8967633843421936, + "rewards/tag_count_reward/std": 0.2407706081867218, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1055.0379638671875, + "completions/mean_terminated_length": 801.9299926757812, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.20904586862713762, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12567605039222451, + "kl": 0.018524169921875, + "learning_rate": 9.679989204092624e-07, + "loss": 0.0777, + "num_tokens": 603214923.0, + "reward": 1.563616156578064, + "reward_std": 0.35769954323768616, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.20945784449577332, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1084.3929443359375, + "completions/mean_terminated_length": 858.7548217773438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20925896329444355, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.130230622770987, + "kl": 0.01715087890625, + "learning_rate": 9.678748193123075e-07, + "loss": 0.1088, + "num_tokens": 603761483.0, + "reward": 1.3895089626312256, + "reward_std": 0.3803202211856842, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.22568464279174805, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1104.602783203125, + "completions/mean_terminated_length": 870.7242431640625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2094720579617495, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12281837312874369, + "kl": 0.0163116455078125, + "learning_rate": 9.677504869536255e-07, + "loss": 0.0622, + "num_tokens": 604324297.0, + "reward": 1.4246652126312256, + "reward_std": 0.31938737630844116, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.1837497353553772, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1051.3013916015625, + "completions/mean_terminated_length": 850.8927612304688, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.20968515262905546, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12220977660752079, + "kl": 0.018341064453125, + "learning_rate": 9.67625923402025e-07, + "loss": 0.0494, + "num_tokens": 604861360.0, + "reward": 1.3867188692092896, + "reward_std": 0.35236823558807373, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.21263070404529572, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1087.8348388671875, + "completions/mean_terminated_length": 910.0264282226562, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.20989824729636142, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11963464126886837, + "kl": 0.01898193359375, + "learning_rate": 9.675011287264427e-07, + "loss": 0.0675, + "num_tokens": 605419510.0, + "reward": 1.3030134439468384, + "reward_std": 0.33659690618515015, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.4903542101383209, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9034598469734192, + "rewards/tag_count_reward/std": 0.2347692996263504, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 998.779052734375, + "completions/mean_terminated_length": 784.4220581054688, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.21011134196366735, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1318557569983656, + "kl": 0.017242431640625, + "learning_rate": 9.673761029959426e-07, + "loss": 0.098, + "num_tokens": 605930915.0, + "reward": 1.4475446939468384, + "reward_std": 0.37351909279823303, + "rewards/accuracy_reward/mean": 0.5509259104728699, + "rewards/accuracy_reward/std": 0.49797651171684265, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.22552968561649323, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1087.180908203125, + "completions/mean_terminated_length": 814.6275634765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2103244366309733, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11838296641766269, + "kl": 0.0165557861328125, + "learning_rate": 9.672508462797168e-07, + "loss": 0.0581, + "num_tokens": 606491892.0, + "reward": 1.2717634439468384, + "reward_std": 0.3784574270248413, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8833705186843872, + "rewards/tag_count_reward/std": 0.2684724032878876, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1014.4598388671875, + "completions/mean_terminated_length": 796.578369140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.21053753129827926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1277266733273894, + "kl": 0.01873779296875, + "learning_rate": 9.671253586470854e-07, + "loss": 0.1051, + "num_tokens": 607018114.0, + "reward": 1.4977679252624512, + "reward_std": 0.36809930205345154, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.20721976459026337, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 845.325927734375, + "completions/mean_terminated_length": 714.341552734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.21075062596558522, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13946916096495987, + "kl": 0.02032470703125, + "learning_rate": 9.669996401674963e-07, + "loss": 0.0738, + "num_tokens": 607459892.0, + "reward": 1.6556919813156128, + "reward_std": 0.3361762762069702, + "rewards/accuracy_reward/mean": 0.7142857313156128, + "rewards/accuracy_reward/std": 0.45225897431373596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1818031221628189, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 993.7500610351562, + "completions/mean_terminated_length": 798.5184936523438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.21096372063289115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13296140771030554, + "kl": 0.019775390625, + "learning_rate": 9.66873690910525e-07, + "loss": 0.0335, + "num_tokens": 607980164.0, + "reward": 1.4977679252624512, + "reward_std": 0.35170823335647583, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.2077612727880478, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1008.5870971679688, + "completions/mean_terminated_length": 802.9277954101562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2111768153001971, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13610661049162912, + "kl": 0.019073486328125, + "learning_rate": 9.667475109458747e-07, + "loss": 0.1097, + "num_tokens": 608494155.0, + "reward": 1.4341518878936768, + "reward_std": 0.36671048402786255, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.20146164298057556, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 949.76123046875, + "completions/mean_terminated_length": 749.8179931640625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.21138990996750306, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13650530439734018, + "kl": 0.021575927734375, + "learning_rate": 9.66621100343376e-07, + "loss": 0.0825, + "num_tokens": 608983984.0, + "reward": 1.4475446939468384, + "reward_std": 0.31377193331718445, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.1880783587694168, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1067.55810546875, + "completions/mean_terminated_length": 803.69970703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.21160300463480902, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11904261223702872, + "kl": 0.018585205078125, + "learning_rate": 9.664944591729884e-07, + "loss": 0.0986, + "num_tokens": 609541210.0, + "reward": 1.290178656578064, + "reward_std": 0.38875582814216614, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8839285969734192, + "rewards/tag_count_reward/std": 0.25728440284729004, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 1081.7076416015625, + "completions/mean_terminated_length": 778.50146484375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.21181609930211495, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15340419817053028, + "kl": 0.01947021484375, + "learning_rate": 9.663675875047974e-07, + "loss": 0.0662, + "num_tokens": 610101431.0, + "reward": 1.4006696939468384, + "reward_std": 0.32147231698036194, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.24499371647834778, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 942.4375610351562, + "completions/mean_terminated_length": 764.860107421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2120291939694209, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13584641220068627, + "kl": 0.019439697265625, + "learning_rate": 9.662404854090171e-07, + "loss": 0.0676, + "num_tokens": 610590763.0, + "reward": 1.5117188692092896, + "reward_std": 0.34111541509628296, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.2073153853416443, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 911.0960083007812, + "completions/mean_terminated_length": 742.0179443359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21224228863672687, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1248384385399846, + "kl": 0.02117919921875, + "learning_rate": 9.661131529559883e-07, + "loss": 0.0479, + "num_tokens": 611061974.0, + "reward": 1.5837054252624512, + "reward_std": 0.3017288148403168, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.18348205089569092, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 933.7902221679688, + "completions/mean_terminated_length": 734.4052734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21245538330403282, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13126097036720397, + "kl": 0.022003173828125, + "learning_rate": 9.659855902161804e-07, + "loss": 0.0855, + "num_tokens": 611548440.0, + "reward": 1.6484376192092896, + "reward_std": 0.3285175859928131, + "rewards/accuracy_reward/mean": 0.7053571343421936, + "rewards/accuracy_reward/std": 0.45639166235923767, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.17569643259048462, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1068.5067138671875, + "completions/mean_terminated_length": 855.5733642578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21266847797133875, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12232301469490667, + "kl": 0.0171051025390625, + "learning_rate": 9.65857797260189e-07, + "loss": 0.0551, + "num_tokens": 612091419.0, + "reward": 1.356584906578064, + "reward_std": 0.3327273428440094, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.20082469284534454, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 989.7678833007812, + "completions/mean_terminated_length": 763.2086791992188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2128815726386447, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.7245688070990458, + "kl": 0.0257568359375, + "learning_rate": 9.657297741587381e-07, + "loss": 0.0279, + "num_tokens": 612617267.0, + "reward": 1.4709821939468384, + "reward_std": 0.3218078017234802, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.4966535270214081, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.21720866858959198, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1081.8773193359375, + "completions/mean_terminated_length": 845.7139282226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21309466730595067, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12125873848436415, + "kl": 0.017578125, + "learning_rate": 9.656015209826788e-07, + "loss": 0.0575, + "num_tokens": 613170508.0, + "reward": 1.4213169813156128, + "reward_std": 0.3136279284954071, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.21439646184444427, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1204.328125, + "completions/mean_terminated_length": 902.6514892578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21330776197325663, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13076941046090076, + "kl": 0.01617431640625, + "learning_rate": 9.654730378029892e-07, + "loss": 0.1085, + "num_tokens": 613780783.0, + "reward": 1.2126116752624512, + "reward_std": 0.3802720606327057, + "rewards/accuracy_reward/mean": 0.3214285671710968, + "rewards/accuracy_reward/std": 0.4675469994544983, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8911830186843872, + "rewards/tag_count_reward/std": 0.2547432482242584, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 952.4620971679688, + "completions/mean_terminated_length": 811.7254028320312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21352085664056256, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13414171984312873, + "kl": 0.021392822265625, + "learning_rate": 9.653443246907748e-07, + "loss": 0.0751, + "num_tokens": 614274334.0, + "reward": 1.5412946939468384, + "reward_std": 0.3479004204273224, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.18955937027931213, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 944.7857666015625, + "completions/mean_terminated_length": 764.2597045898438, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.2137339513078685, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12487884729558162, + "kl": 0.021026611328125, + "learning_rate": 9.652153817172686e-07, + "loss": 0.0764, + "num_tokens": 614770654.0, + "reward": 1.4849331378936768, + "reward_std": 0.3466637432575226, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.21440230309963226, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 936.997802734375, + "completions/mean_terminated_length": 784.7283935546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21394704597517447, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1334543784185448, + "kl": 0.020751953125, + "learning_rate": 9.650862089538307e-07, + "loss": 0.0746, + "num_tokens": 615259501.0, + "reward": 1.4681919813156128, + "reward_std": 0.3186980187892914, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9280133843421936, + "rewards/tag_count_reward/std": 0.19992130994796753, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1122.5982666015625, + "completions/mean_terminated_length": 853.2449340820312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21416014064248043, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.130870093656659, + "kl": 0.0172882080078125, + "learning_rate": 9.649568064719482e-07, + "loss": 0.1053, + "num_tokens": 615839913.0, + "reward": 1.2946429252624512, + "reward_std": 0.35064026713371277, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.23377349972724915, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1087.259033203125, + "completions/mean_terminated_length": 884.724365234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2143732353097864, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11480697121033301, + "kl": 0.017730712890625, + "learning_rate": 9.648271743432355e-07, + "loss": 0.054, + "num_tokens": 616394077.0, + "reward": 1.4960938692092896, + "reward_std": 0.4023757874965668, + "rewards/accuracy_reward/mean": 0.6064814925193787, + "rewards/accuracy_reward/std": 0.4890965521335602, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.22577519714832306, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1120.54248046875, + "completions/mean_terminated_length": 860.854248046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21458632997709232, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13828937044665743, + "kl": 0.016998291015625, + "learning_rate": 9.646973126394341e-07, + "loss": 0.1485, + "num_tokens": 616973168.0, + "reward": 1.2388393878936768, + "reward_std": 0.388698935508728, + "rewards/accuracy_reward/mean": 0.3541666567325592, + "rewards/accuracy_reward/std": 0.4788145422935486, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8973214030265808, + "rewards/tag_count_reward/std": 0.24759145081043243, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1159.546875, + "completions/mean_terminated_length": 914.0199584960938, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.21479942464439827, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12224700834142749, + "kl": 0.016021728515625, + "learning_rate": 9.64567221432413e-07, + "loss": 0.0752, + "num_tokens": 617563621.0, + "reward": 1.340959906578064, + "reward_std": 0.33923614025115967, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.2053246796131134, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1085.8951416015625, + "completions/mean_terminated_length": 854.0304565429688, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.21501251931170423, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14645432242150813, + "kl": 0.0203857421875, + "learning_rate": 9.644369007941667e-07, + "loss": 0.1271, + "num_tokens": 618128566.0, + "reward": 1.333147406578064, + "reward_std": 0.34795358777046204, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9135044813156128, + "rewards/tag_count_reward/std": 0.2119780033826828, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1019.1808471679688, + "completions/mean_terminated_length": 778.2727661132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2152256139790102, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1338888170324766, + "kl": 0.0184326171875, + "learning_rate": 9.643063507968185e-07, + "loss": 0.1049, + "num_tokens": 618658055.0, + "reward": 1.4481027126312256, + "reward_std": 0.37270089983940125, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8989955186843872, + "rewards/tag_count_reward/std": 0.24344676733016968, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1167.625, + "completions/mean_terminated_length": 949.3704833984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.21543870864631612, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11457849712008734, + "kl": 0.0164794921875, + "learning_rate": 9.641755715126176e-07, + "loss": 0.0632, + "num_tokens": 619253631.0, + "reward": 1.3621652126312256, + "reward_std": 0.3880992829799652, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.22919411957263947, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1028.243408203125, + "completions/mean_terminated_length": 836.193603515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21565180331362208, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12193408775829491, + "kl": 0.018157958984375, + "learning_rate": 9.6404456301394e-07, + "loss": 0.0259, + "num_tokens": 619780972.0, + "reward": 1.3627232313156128, + "reward_std": 0.2980118989944458, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.18280036747455597, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1079.779052734375, + "completions/mean_terminated_length": 839.7465209960938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.21586489798092803, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.8562240210019115, + "kl": 0.06195068359375, + "learning_rate": 9.639133253732895e-07, + "loss": 0.0978, + "num_tokens": 620343097.0, + "reward": 1.4285714626312256, + "reward_std": 0.3785489499568939, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8928571343421936, + "rewards/tag_count_reward/std": 0.2584463059902191, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 946.6004638671875, + "completions/mean_terminated_length": 769.6917114257812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.216077992648234, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1465146642191623, + "kl": 0.02099609375, + "learning_rate": 9.637818586632957e-07, + "loss": 0.1322, + "num_tokens": 620836006.0, + "reward": 1.4140626192092896, + "reward_std": 0.33451762795448303, + "rewards/accuracy_reward/mean": 0.5208333134651184, + "rewards/accuracy_reward/std": 0.5001450181007385, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.22998054325580597, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1085.125, + "completions/mean_terminated_length": 819.0313110351562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21629108731553992, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12907170009071153, + "kl": 0.018890380859375, + "learning_rate": 9.636501629567153e-07, + "loss": 0.1305, + "num_tokens": 621388958.0, + "reward": 1.4029018878936768, + "reward_std": 0.35832539200782776, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.22654591500759125, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 987.4598388671875, + "completions/mean_terminated_length": 835.9540405273438, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.21650418198284588, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13506395520538472, + "kl": 0.02105712890625, + "learning_rate": 9.635182383264322e-07, + "loss": 0.0484, + "num_tokens": 621908268.0, + "reward": 1.4944196939468384, + "reward_std": 0.4193144738674164, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.20802249014377594, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 904.10498046875, + "completions/mean_terminated_length": 740.6912841796875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.21671727665015184, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12805935434893032, + "kl": 0.021331787109375, + "learning_rate": 9.63386084845456e-07, + "loss": 0.0827, + "num_tokens": 622373787.0, + "reward": 1.6255581378936768, + "reward_std": 0.34637704491615295, + "rewards/accuracy_reward/mean": 0.6941964030265808, + "rewards/accuracy_reward/std": 0.461262047290802, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.1990012526512146, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1033.5670166015625, + "completions/mean_terminated_length": 852.036865234375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2169303713174578, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11895942802725464, + "kl": 0.0186767578125, + "learning_rate": 9.63253702586924e-07, + "loss": 0.0893, + "num_tokens": 622907609.0, + "reward": 1.3844866752624512, + "reward_std": 0.36421677470207214, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.19114895164966583, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 979.2232666015625, + "completions/mean_terminated_length": 820.2769775390625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.21714346598476372, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13699796921985968, + "kl": 0.017364501953125, + "learning_rate": 9.631210916240995e-07, + "loss": 0.0851, + "num_tokens": 623410413.0, + "reward": 1.5195313692092896, + "reward_std": 0.3155477046966553, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.1752651482820511, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 950.982177734375, + "completions/mean_terminated_length": 751.26123046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21735656065206968, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1450168782326325, + "kl": 0.02294921875, + "learning_rate": 9.629882520303726e-07, + "loss": 0.0752, + "num_tokens": 623907637.0, + "reward": 1.4492188692092896, + "reward_std": 0.2938949167728424, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.20309330523014069, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 925.60498046875, + "completions/mean_terminated_length": 752.0386352539062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.21756965531937564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15304126104011737, + "kl": 0.022705078125, + "learning_rate": 9.628551838792597e-07, + "loss": 0.0876, + "num_tokens": 624391700.0, + "reward": 1.555803656578064, + "reward_std": 0.3249172568321228, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.16709057986736298, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1062.575927734375, + "completions/mean_terminated_length": 779.4080200195312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2177827499866816, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12268904468243151, + "kl": 0.0164794921875, + "learning_rate": 9.627218872444037e-07, + "loss": 0.0526, + "num_tokens": 624942694.0, + "reward": 1.3794643878936768, + "reward_std": 0.35217374563217163, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9129464030265808, + "rewards/tag_count_reward/std": 0.23698757588863373, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1002.9397583007812, + "completions/mean_terminated_length": 859.7081298828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21799584465398752, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12139979369800806, + "kl": 0.019012451171875, + "learning_rate": 9.625883621995743e-07, + "loss": 0.033, + "num_tokens": 625460747.0, + "reward": 1.5256696939468384, + "reward_std": 0.3746279180049896, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.1673406958580017, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1001.4732666015625, + "completions/mean_terminated_length": 810.9446411132812, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.21820893932129348, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13689314803119318, + "kl": 0.0206298828125, + "learning_rate": 9.624546088186677e-07, + "loss": 0.0904, + "num_tokens": 625979487.0, + "reward": 1.4587054252624512, + "reward_std": 0.36251774430274963, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.2177339345216751, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1037.290283203125, + "completions/mean_terminated_length": 817.5706787109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.21842203398859944, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12281669816573315, + "kl": 0.0189208984375, + "learning_rate": 9.623206271757056e-07, + "loss": 0.1135, + "num_tokens": 626518673.0, + "reward": 1.3895089626312256, + "reward_std": 0.3988742232322693, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9095982313156128, + "rewards/tag_count_reward/std": 0.24331659078598022, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 966.7991333007812, + "completions/mean_terminated_length": 827.9042358398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2186351286559054, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12982617643749036, + "kl": 0.020721435546875, + "learning_rate": 9.621864173448367e-07, + "loss": 0.0766, + "num_tokens": 627016791.0, + "reward": 1.4626116752624512, + "reward_std": 0.34920039772987366, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9246651530265808, + "rewards/tag_count_reward/std": 0.20491690933704376, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1091.966552734375, + "completions/mean_terminated_length": 861.5650634765625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.21884822332321133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12834424542553474, + "kl": 0.01690673828125, + "learning_rate": 9.620519794003362e-07, + "loss": 0.1199, + "num_tokens": 627576568.0, + "reward": 1.364397406578064, + "reward_std": 0.33898791670799255, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9045758843421936, + "rewards/tag_count_reward/std": 0.23102784156799316, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 973.0089721679688, + "completions/mean_terminated_length": 809.9639892578125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.21906131799051728, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.3401349518761631, + "kl": 0.0272216796875, + "learning_rate": 9.61917313416605e-07, + "loss": 0.0819, + "num_tokens": 628073740.0, + "reward": 1.5524554252624512, + "reward_std": 0.33791205286979675, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.190138041973114, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 1132.2723388671875, + "completions/mean_terminated_length": 892.37744140625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.21927441265782324, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1319965470257071, + "kl": 0.018829345703125, + "learning_rate": 9.617824194681703e-07, + "loss": 0.1119, + "num_tokens": 628648278.0, + "reward": 1.4497768878936768, + "reward_std": 0.35723069310188293, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.24061305820941925, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 937.6428833007812, + "completions/mean_terminated_length": 745.801025390625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2194875073251292, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14290440942335067, + "kl": 0.020538330078125, + "learning_rate": 9.616472976296855e-07, + "loss": 0.13, + "num_tokens": 629138518.0, + "reward": 1.5251116752624512, + "reward_std": 0.4350827634334564, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.2255041003227234, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1029.5491943359375, + "completions/mean_terminated_length": 828.0374755859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.21970060199243513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13104418071881732, + "kl": 0.019287109375, + "learning_rate": 9.615119479759307e-07, + "loss": 0.0817, + "num_tokens": 629665356.0, + "reward": 1.458147406578064, + "reward_std": 0.3374355137348175, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.21699519455432892, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1018.15185546875, + "completions/mean_terminated_length": 864.994873046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2199136966597411, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13179287190678912, + "kl": 0.019989013671875, + "learning_rate": 9.61376370581811e-07, + "loss": 0.1198, + "num_tokens": 630189312.0, + "reward": 1.5625001192092896, + "reward_std": 0.4315028488636017, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.22216394543647766, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1005.3795166015625, + "completions/mean_terminated_length": 788.9865112304688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22012679132704704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13303082434711633, + "kl": 0.017608642578125, + "learning_rate": 9.612405655223585e-07, + "loss": 0.0692, + "num_tokens": 630704490.0, + "reward": 1.4062501192092896, + "reward_std": 0.3164728283882141, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.208564892411232, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1026.997802734375, + "completions/mean_terminated_length": 875.1564331054688, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.220339885994353, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12440418972766916, + "kl": 0.0181884765625, + "learning_rate": 9.611045328727306e-07, + "loss": 0.0667, + "num_tokens": 631235273.0, + "reward": 1.5072544813156128, + "reward_std": 0.41397711634635925, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9135044813156128, + "rewards/tag_count_reward/std": 0.2241603583097458, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1221.544677734375, + "completions/mean_terminated_length": 962.2169799804688, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.22055298066165893, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10698898828875082, + "kl": 0.014739990234375, + "learning_rate": 9.609682727082115e-07, + "loss": 0.0715, + "num_tokens": 631861277.0, + "reward": 1.380022406578064, + "reward_std": 0.4156619608402252, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8956473469734192, + "rewards/tag_count_reward/std": 0.24660564959049225, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1229.2679443359375, + "completions/mean_terminated_length": 922.8711547851562, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.2207660753289649, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12908320574076765, + "kl": 0.0152130126953125, + "learning_rate": 9.6083178510421e-07, + "loss": 0.1468, + "num_tokens": 632487733.0, + "reward": 1.28515625, + "reward_std": 0.41717156767845154, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8766741156578064, + "rewards/tag_count_reward/std": 0.27881479263305664, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1030.4710693359375, + "completions/mean_terminated_length": 822.5886840820312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22097916999627085, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1318622567323445, + "kl": 0.019256591796875, + "learning_rate": 9.60695070136262e-07, + "loss": 0.1062, + "num_tokens": 633018600.0, + "reward": 1.387834906578064, + "reward_std": 0.3541650176048279, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8722098469734192, + "rewards/tag_count_reward/std": 0.2633313834667206, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1059.2567138671875, + "completions/mean_terminated_length": 837.7349243164062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2211922646635768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13777712246161447, + "kl": 0.018157958984375, + "learning_rate": 9.60558127880029e-07, + "loss": 0.111, + "num_tokens": 633566379.0, + "reward": 1.3883929252624512, + "reward_std": 0.4226250946521759, + "rewards/accuracy_reward/mean": 0.5208333134651184, + "rewards/accuracy_reward/std": 0.5001450181007385, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8861607313156128, + "rewards/tag_count_reward/std": 0.23324955999851227, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1124.8192138671875, + "completions/mean_terminated_length": 845.718017578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22140535933088273, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10934617098519027, + "kl": 0.017120361328125, + "learning_rate": 9.604209584112975e-07, + "loss": 0.051, + "num_tokens": 634141050.0, + "reward": 1.4654018878936768, + "reward_std": 0.37578099966049194, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.23359987139701843, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1105.134033203125, + "completions/mean_terminated_length": 844.56982421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2216184539981887, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1316632638977043, + "kl": 0.020965576171875, + "learning_rate": 9.602835618059808e-07, + "loss": 0.1475, + "num_tokens": 634697958.0, + "reward": 1.5033482313156128, + "reward_std": 0.4102972745895386, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8582589030265808, + "rewards/tag_count_reward/std": 0.26598531007766724, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1044.60498046875, + "completions/mean_terminated_length": 826.4755859375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.22183154866549465, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.2152842543609118, + "kl": 0.0208740234375, + "learning_rate": 9.601459381401167e-07, + "loss": 0.082, + "num_tokens": 635238421.0, + "reward": 1.430803656578064, + "reward_std": 0.44480612874031067, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8638392686843872, + "rewards/tag_count_reward/std": 0.25665283203125, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1017.8348388671875, + "completions/mean_terminated_length": 823.8248901367188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2220446433328006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13736390359173417, + "kl": 0.019500732421875, + "learning_rate": 9.600080874898702e-07, + "loss": 0.0848, + "num_tokens": 635761307.0, + "reward": 1.4090402126312256, + "reward_std": 0.3900696337223053, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.26011165976524353, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1174.328125, + "completions/mean_terminated_length": 865.507568359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22225773800010654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12768329049189028, + "kl": 0.0167083740234375, + "learning_rate": 9.598700099315307e-07, + "loss": 0.1312, + "num_tokens": 636360830.0, + "reward": 1.2488839626312256, + "reward_std": 0.44803357124328613, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8426339030265808, + "rewards/tag_count_reward/std": 0.29785096645355225, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1126.6160888671875, + "completions/mean_terminated_length": 923.2588500976562, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.2224708326674125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13348343720204275, + "kl": 0.0168609619140625, + "learning_rate": 9.597317055415135e-07, + "loss": 0.0715, + "num_tokens": 636928962.0, + "reward": 1.4481027126312256, + "reward_std": 0.4428902864456177, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8833705186843872, + "rewards/tag_count_reward/std": 0.24562665820121765, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1141.5357666015625, + "completions/mean_terminated_length": 867.4883422851562, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.22268392733471845, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1292659598688964, + "kl": 0.0169525146484375, + "learning_rate": 9.595931743963596e-07, + "loss": 0.0929, + "num_tokens": 637513346.0, + "reward": 1.3716518878936768, + "reward_std": 0.37760260701179504, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.23209868371486664, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1021.0870971679688, + "completions/mean_terminated_length": 859.2222290039062, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2228970220020244, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1386405943006892, + "kl": 0.01806640625, + "learning_rate": 9.594544165727354e-07, + "loss": 0.1175, + "num_tokens": 638042633.0, + "reward": 1.5345982313156128, + "reward_std": 0.3629516363143921, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348817229270935, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9051339030265808, + "rewards/tag_count_reward/std": 0.2197883427143097, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1077.38623046875, + "completions/mean_terminated_length": 859.9262084960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.22311011666933034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12466369630807479, + "kl": 0.018646240234375, + "learning_rate": 9.593154321474326e-07, + "loss": 0.0991, + "num_tokens": 638593014.0, + "reward": 1.434709906578064, + "reward_std": 0.406714528799057, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8900669813156128, + "rewards/tag_count_reward/std": 0.2509413957595825, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 945.8303833007812, + "completions/mean_terminated_length": 752.010498046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2233232113366363, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15353207489639883, + "kl": 0.020294189453125, + "learning_rate": 9.591762211973687e-07, + "loss": 0.1243, + "num_tokens": 639073546.0, + "reward": 1.5334821939468384, + "reward_std": 0.41594818234443665, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.4769369065761566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8816964030265808, + "rewards/tag_count_reward/std": 0.2578950524330139, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1115.622802734375, + "completions/mean_terminated_length": 847.6982421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22353630600394225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16457073235828965, + "kl": 0.017852783203125, + "learning_rate": 9.59036783799586e-07, + "loss": 0.1101, + "num_tokens": 639645329.0, + "reward": 1.3113839626312256, + "reward_std": 0.43677186965942383, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8582589030265808, + "rewards/tag_count_reward/std": 0.2857520282268524, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1122.734375, + "completions/mean_terminated_length": 839.4898071289062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2237494006712482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11942225933021121, + "kl": 0.017852783203125, + "learning_rate": 9.588971200312525e-07, + "loss": 0.089, + "num_tokens": 640215850.0, + "reward": 1.3777902126312256, + "reward_std": 0.4159053564071655, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8978794813156128, + "rewards/tag_count_reward/std": 0.24412783980369568, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1123.96875, + "completions/mean_terminated_length": 837.5731201171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22396249533855414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12926572051627483, + "kl": 0.018280029296875, + "learning_rate": 9.587572299696617e-07, + "loss": 0.1074, + "num_tokens": 640785036.0, + "reward": 1.3359376192092896, + "reward_std": 0.41222265362739563, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8761160969734192, + "rewards/tag_count_reward/std": 0.26227980852127075, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1012.5469360351562, + "completions/mean_terminated_length": 879.5289306640625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.2241755900058601, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12155619563747203, + "kl": 0.01824951171875, + "learning_rate": 9.586171136922315e-07, + "loss": 0.0569, + "num_tokens": 641308817.0, + "reward": 1.4224331378936768, + "reward_std": 0.3785761594772339, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.21372579038143158, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1213.529052734375, + "completions/mean_terminated_length": 911.6990966796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22438868467316606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12549264496472973, + "kl": 0.01580810546875, + "learning_rate": 9.58476771276506e-07, + "loss": 0.0889, + "num_tokens": 641924478.0, + "reward": 1.3889509439468384, + "reward_std": 0.4436909258365631, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9001116156578064, + "rewards/tag_count_reward/std": 0.2439078539609909, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1090.49560546875, + "completions/mean_terminated_length": 888.6432495117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.224601779340472, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12344296556345123, + "kl": 0.01678466796875, + "learning_rate": 9.583362028001537e-07, + "loss": 0.069, + "num_tokens": 642486396.0, + "reward": 1.3443081378936768, + "reward_std": 0.2917519807815552, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.19568641483783722, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1027.388427734375, + "completions/mean_terminated_length": 896.2770385742188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22481487400777794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13365963009169202, + "kl": 0.0179290771484375, + "learning_rate": 9.58195408340969e-07, + "loss": 0.0537, + "num_tokens": 643015610.0, + "reward": 1.5106027126312256, + "reward_std": 0.3467734754085541, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.19157297909259796, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1030.88623046875, + "completions/mean_terminated_length": 778.7326049804688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2250279686750839, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1379582608634983, + "kl": 0.01904296875, + "learning_rate": 9.580543879768702e-07, + "loss": 0.1071, + "num_tokens": 643550487.0, + "reward": 1.5039063692092896, + "reward_std": 0.4127042889595032, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835427761078, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8722098469734192, + "rewards/tag_count_reward/std": 0.2767925560474396, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 982.2857666015625, + "completions/mean_terminated_length": 791.5789794921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22524106334238986, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13965565352135834, + "kl": 0.019622802734375, + "learning_rate": 9.579131417859016e-07, + "loss": 0.094, + "num_tokens": 644057207.0, + "reward": 1.4810268878936768, + "reward_std": 0.37192073464393616, + "rewards/accuracy_reward/mean": 0.5740740895271301, + "rewards/accuracy_reward/std": 0.4950558841228485, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.2109544575214386, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 935.9688110351562, + "completions/mean_terminated_length": 802.5249633789062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22545415800969582, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1324908828382511, + "kl": 0.019805908203125, + "learning_rate": 9.57771669846232e-07, + "loss": 0.0627, + "num_tokens": 644542793.0, + "reward": 1.5463169813156128, + "reward_std": 0.4019099175930023, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.2158125787973404, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1066.9754638671875, + "completions/mean_terminated_length": 843.8931884765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22566725267700175, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1147936390042102, + "kl": 0.018341064453125, + "learning_rate": 9.576299722361556e-07, + "loss": 0.0757, + "num_tokens": 645086062.0, + "reward": 1.454241156578064, + "reward_std": 0.3843628168106079, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.21683764457702637, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1111.2857666015625, + "completions/mean_terminated_length": 838.6397705078125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2258803473443077, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11575035667430436, + "kl": 0.0166015625, + "learning_rate": 9.574880490340908e-07, + "loss": 0.0576, + "num_tokens": 645650014.0, + "reward": 1.3945313692092896, + "reward_std": 0.33647313714027405, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.19151432812213898, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1125.1273193359375, + "completions/mean_terminated_length": 893.1200561523438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.22609344201161366, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11575299408640558, + "kl": 0.017730712890625, + "learning_rate": 9.573459003185816e-07, + "loss": 0.0793, + "num_tokens": 646223431.0, + "reward": 1.3593751192092896, + "reward_std": 0.3176526427268982, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9196428656578064, + "rewards/tag_count_reward/std": 0.21143031120300293, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 972.654052734375, + "completions/mean_terminated_length": 763.3200073242188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22630653667891962, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11779861628180471, + "kl": 0.017669677734375, + "learning_rate": 9.572035261682961e-07, + "loss": 0.0498, + "num_tokens": 646727868.0, + "reward": 1.3733259439468384, + "reward_std": 0.32931098341941833, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.171964630484581, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1058.65185546875, + "completions/mean_terminated_length": 823.61328125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.22651963134622555, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12909247656160433, + "kl": 0.0170135498046875, + "learning_rate": 9.570609266620277e-07, + "loss": 0.0931, + "num_tokens": 647266416.0, + "reward": 1.4224331378936768, + "reward_std": 0.3048005998134613, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.19962133467197418, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 1001.138427734375, + "completions/mean_terminated_length": 813.8052978515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2267327260135315, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.20340384321624796, + "kl": 0.020965576171875, + "learning_rate": 9.569181018786942e-07, + "loss": 0.1119, + "num_tokens": 647777630.0, + "reward": 1.5318081378936768, + "reward_std": 0.3819688856601715, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8956473469734192, + "rewards/tag_count_reward/std": 0.24489878118038177, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1050.638427734375, + "completions/mean_terminated_length": 792.8932495117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22694582068083746, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12182738191951983, + "kl": 0.01837158203125, + "learning_rate": 9.567750518973384e-07, + "loss": 0.0405, + "num_tokens": 648324412.0, + "reward": 1.352678656578064, + "reward_std": 0.3457717299461365, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.21386082470417023, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1005.5870971679688, + "completions/mean_terminated_length": 785.8351440429688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.22715891534814342, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1338490146706846, + "kl": 0.01898193359375, + "learning_rate": 9.566317767971272e-07, + "loss": 0.1153, + "num_tokens": 648843107.0, + "reward": 1.4598214626312256, + "reward_std": 0.3113630712032318, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.21618320047855377, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1007.8795166015625, + "completions/mean_terminated_length": 805.4026489257812, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.22737201001544935, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12914294594114475, + "kl": 0.019622802734375, + "learning_rate": 9.564882766573525e-07, + "loss": 0.0663, + "num_tokens": 649360093.0, + "reward": 1.4486607313156128, + "reward_std": 0.36453840136528015, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.21438556909561157, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 973.4531860351562, + "completions/mean_terminated_length": 787.7984619140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2275851046827553, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13327261206914826, + "kl": 0.017669677734375, + "learning_rate": 9.563445515574307e-07, + "loss": 0.0752, + "num_tokens": 649876968.0, + "reward": 1.3398438692092896, + "reward_std": 0.34925320744514465, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.21955746412277222, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1050.54248046875, + "completions/mean_terminated_length": 853.1845092773438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22779819935006126, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1338623522910935, + "kl": 0.0163421630859375, + "learning_rate": 9.562006015769027e-07, + "loss": 0.0753, + "num_tokens": 650422219.0, + "reward": 1.4235491752624512, + "reward_std": 0.36818650364875793, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9213169813156128, + "rewards/tag_count_reward/std": 0.22267401218414307, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 960.997802734375, + "completions/mean_terminated_length": 812.0177612304688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22801129401736722, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13236301406448922, + "kl": 0.021240234375, + "learning_rate": 9.560564267954338e-07, + "loss": 0.0922, + "num_tokens": 650920442.0, + "reward": 1.6032366752624512, + "reward_std": 0.37877747416496277, + "rewards/accuracy_reward/mean": 0.6919642686843872, + "rewards/accuracy_reward/std": 0.46219751238822937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.2232842743396759, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1049.1429443359375, + "completions/mean_terminated_length": 851.508056640625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22822438868467318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1470817554146858, + "kl": 0.017730712890625, + "learning_rate": 9.559120272928135e-07, + "loss": 0.0915, + "num_tokens": 651461130.0, + "reward": 1.3738839626312256, + "reward_std": 0.34070566296577454, + "rewards/accuracy_reward/mean": 0.46990740299224854, + "rewards/accuracy_reward/std": 0.4996722936630249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9207589030265808, + "rewards/tag_count_reward/std": 0.2131679803133011, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1104.49560546875, + "completions/mean_terminated_length": 850.577880859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2284374833519791, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11615639755764579, + "kl": 0.015960693359375, + "learning_rate": 9.557674031489563e-07, + "loss": 0.0711, + "num_tokens": 652022536.0, + "reward": 1.3426339626312256, + "reward_std": 0.33037620782852173, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.21992461383342743, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1003.2366333007812, + "completions/mean_terminated_length": 779.5609741210938, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.22865057801928507, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12237143573072615, + "kl": 0.019012451171875, + "learning_rate": 9.556225544438998e-07, + "loss": 0.0288, + "num_tokens": 652540594.0, + "reward": 1.4687501192092896, + "reward_std": 0.3836686909198761, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9196428656578064, + "rewards/tag_count_reward/std": 0.21600989997386932, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1162.3616943359375, + "completions/mean_terminated_length": 897.9536743164062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.22886367268659102, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11945904960646864, + "kl": 0.0159759521484375, + "learning_rate": 9.554774812578078e-07, + "loss": 0.0656, + "num_tokens": 653130292.0, + "reward": 1.32421875, + "reward_std": 0.33299005031585693, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8934151530265808, + "rewards/tag_count_reward/std": 0.2616334557533264, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1072.7098388671875, + "completions/mean_terminated_length": 806.7216186523438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22907676735389698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12838378433575587, + "kl": 0.018402099609375, + "learning_rate": 9.55332183670966e-07, + "loss": 0.0967, + "num_tokens": 653678546.0, + "reward": 1.4882813692092896, + "reward_std": 0.3962567150592804, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9079241156578064, + "rewards/tag_count_reward/std": 0.2347799688577652, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1059.3660888671875, + "completions/mean_terminated_length": 882.4526977539062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.2292898620212029, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13060781021638646, + "kl": 0.019561767578125, + "learning_rate": 9.551866617637863e-07, + "loss": 0.0806, + "num_tokens": 654224070.0, + "reward": 1.5502232313156128, + "reward_std": 0.3903213143348694, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9207589030265808, + "rewards/tag_count_reward/std": 0.21642272174358368, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 966.029052734375, + "completions/mean_terminated_length": 788.9791870117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22950295668850887, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12991373348762655, + "kl": 0.01898193359375, + "learning_rate": 9.550409156168037e-07, + "loss": 0.0577, + "num_tokens": 654727139.0, + "reward": 1.4726563692092896, + "reward_std": 0.33379653096199036, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.22768032550811768, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1143.8125, + "completions/mean_terminated_length": 870.4534912109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22971605135581483, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12163567022458427, + "kl": 0.01654052734375, + "learning_rate": 9.548949453106776e-07, + "loss": 0.0917, + "num_tokens": 655315711.0, + "reward": 1.3504464626312256, + "reward_std": 0.3578961491584778, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.20881615579128265, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1000.560302734375, + "completions/mean_terminated_length": 786.5671997070312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.22992914602312078, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13204220851346132, + "kl": 0.017852783203125, + "learning_rate": 9.547487509261913e-07, + "loss": 0.1097, + "num_tokens": 655841770.0, + "reward": 1.4196429252624512, + "reward_std": 0.37791186571121216, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9196428656578064, + "rewards/tag_count_reward/std": 0.21985933184623718, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1030.2098388671875, + "completions/mean_terminated_length": 818.9703369140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2301422406904267, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12427325355228985, + "kl": 0.016754150390625, + "learning_rate": 9.546023325442523e-07, + "loss": 0.0804, + "num_tokens": 656373208.0, + "reward": 1.4157366752624512, + "reward_std": 0.3472166359424591, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.16643086075782776, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 965.0089721679688, + "completions/mean_terminated_length": 736.7026977539062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23035533535773267, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14395707121049647, + "kl": 0.019256591796875, + "learning_rate": 9.544556902458919e-07, + "loss": 0.086, + "num_tokens": 656883660.0, + "reward": 1.4888393878936768, + "reward_std": 0.3880871832370758, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.194285050034523, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 944.482177734375, + "completions/mean_terminated_length": 757.2010498046875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.23056843002503863, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1435070377985579, + "kl": 0.020538330078125, + "learning_rate": 9.543088241122653e-07, + "loss": 0.0696, + "num_tokens": 657375604.0, + "reward": 1.4665179252624512, + "reward_std": 0.3584713339805603, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.19695264101028442, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1029.8973388671875, + "completions/mean_terminated_length": 831.7066650390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2307815246923446, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1241564280155971, + "kl": 0.01812744140625, + "learning_rate": 9.541617342246518e-07, + "loss": 0.0921, + "num_tokens": 657906534.0, + "reward": 1.4157366752624512, + "reward_std": 0.3852246403694153, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.21015624701976776, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 941.5826416015625, + "completions/mean_terminated_length": 777.0385131835938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.23099461935965052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13440716688540302, + "kl": 0.02044677734375, + "learning_rate": 9.540144206644545e-07, + "loss": 0.1254, + "num_tokens": 658394827.0, + "reward": 1.5239956378936768, + "reward_std": 0.33290010690689087, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19590957462787628, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1082.9576416015625, + "completions/mean_terminated_length": 856.9834594726562, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.23120771402695647, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17389404479686382, + "kl": 0.02276611328125, + "learning_rate": 9.538668835131996e-07, + "loss": 0.0912, + "num_tokens": 658950504.0, + "reward": 1.4168527126312256, + "reward_std": 0.3355737328529358, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.1893114298582077, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 980.72998046875, + "completions/mean_terminated_length": 752.2357788085938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23142080869426243, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12917280431621653, + "kl": 0.01849365234375, + "learning_rate": 9.537191228525382e-07, + "loss": 0.0623, + "num_tokens": 659454095.0, + "reward": 1.4179688692092896, + "reward_std": 0.3551293909549713, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.2071828842163086, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1012.7344360351562, + "completions/mean_terminated_length": 722.8599853515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2316339033615684, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 6.093094122262371, + "kl": 0.2044677734375, + "learning_rate": 9.535711387642447e-07, + "loss": 0.0984, + "num_tokens": 659981704.0, + "reward": 1.4921876192092896, + "reward_std": 0.33223626017570496, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.2020309567451477, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 930.216552734375, + "completions/mean_terminated_length": 737.0916137695312, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.23184699802887432, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12818750344132163, + "kl": 0.019744873046875, + "learning_rate": 9.534229313302163e-07, + "loss": 0.0599, + "num_tokens": 660469737.0, + "reward": 1.4179688692092896, + "reward_std": 0.39471009373664856, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.21606117486953735, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 997.794677734375, + "completions/mean_terminated_length": 800.0106201171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23206009269618028, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13105103456407677, + "kl": 0.019744873046875, + "learning_rate": 9.532745006324749e-07, + "loss": 0.084, + "num_tokens": 660987117.0, + "reward": 1.5005581378936768, + "reward_std": 0.3536769449710846, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.17670518159866333, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 969.2031860351562, + "completions/mean_terminated_length": 808.7666625976562, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.23227318736348623, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14048518410823574, + "kl": 0.019561767578125, + "learning_rate": 9.531258467531656e-07, + "loss": 0.0611, + "num_tokens": 661496296.0, + "reward": 1.5412946939468384, + "reward_std": 0.3625069260597229, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.1981375813484192, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 952.49560546875, + "completions/mean_terminated_length": 742.7180786132812, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.2324862820307922, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13248315495935378, + "kl": 0.020782470703125, + "learning_rate": 9.529769697745566e-07, + "loss": 0.108, + "num_tokens": 661988518.0, + "reward": 1.5725446939468384, + "reward_std": 0.3769925534725189, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.20643207430839539, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1059.69873046875, + "completions/mean_terminated_length": 818.1138916015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23269937669809812, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11587636675394795, + "kl": 0.018646240234375, + "learning_rate": 9.5282786977904e-07, + "loss": 0.0924, + "num_tokens": 662530895.0, + "reward": 1.4520089626312256, + "reward_std": 0.3783363997936249, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.22406357526779175, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 957.529052734375, + "completions/mean_terminated_length": 795.3564453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23291247136540408, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1532105867200508, + "kl": 0.021697998046875, + "learning_rate": 9.526785468491315e-07, + "loss": 0.0452, + "num_tokens": 663030572.0, + "reward": 1.4135044813156128, + "reward_std": 0.36551082134246826, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.21910780668258667, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1068.83935546875, + "completions/mean_terminated_length": 875.1016235351562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23312556603271004, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.48233333838817555, + "kl": 0.0185546875, + "learning_rate": 9.525290010674696e-07, + "loss": 0.1089, + "num_tokens": 663578948.0, + "reward": 1.3811384439468384, + "reward_std": 0.2970651686191559, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19375662505626678, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 981.6585083007812, + "completions/mean_terminated_length": 763.8037719726562, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.233338660700016, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1976339788494781, + "kl": 0.027923583984375, + "learning_rate": 9.523792325168168e-07, + "loss": 0.0726, + "num_tokens": 664086027.0, + "reward": 1.462053656578064, + "reward_std": 0.3783024847507477, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.2176220864057541, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1017.1317138671875, + "completions/mean_terminated_length": 809.8526000976562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23355175536732192, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13112878736351327, + "kl": 0.02001953125, + "learning_rate": 9.522292412800582e-07, + "loss": 0.0831, + "num_tokens": 664609462.0, + "reward": 1.3844866752624512, + "reward_std": 0.27677157521247864, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.18447525799274445, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1093.8795166015625, + "completions/mean_terminated_length": 809.026123046875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.23376485003462788, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12394259257550978, + "kl": 0.01763916015625, + "learning_rate": 9.520790274402025e-07, + "loss": 0.0929, + "num_tokens": 665165360.0, + "reward": 1.4693081378936768, + "reward_std": 0.36182984709739685, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.1947399377822876, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1000.1250610351562, + "completions/mean_terminated_length": 806.0740356445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23397794470193384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.137832899404671, + "kl": 0.019287109375, + "learning_rate": 9.519285910803816e-07, + "loss": 0.0837, + "num_tokens": 665679176.0, + "reward": 1.5742188692092896, + "reward_std": 0.3561047911643982, + "rewards/accuracy_reward/mean": 0.6696428656578064, + "rewards/accuracy_reward/std": 0.4708675146102905, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.2343062311410904, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1062.10498046875, + "completions/mean_terminated_length": 882.6148071289062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.2341910393692398, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16347228099305386, + "kl": 0.0167236328125, + "learning_rate": 9.517779322838506e-07, + "loss": 0.0772, + "num_tokens": 666230551.0, + "reward": 1.4436384439468384, + "reward_std": 0.380472332239151, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.18789489567279816, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1060.7701416015625, + "completions/mean_terminated_length": 795.0849609375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23440413403654572, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1272744486399644, + "kl": 0.01922607421875, + "learning_rate": 9.516270511339877e-07, + "loss": 0.0368, + "num_tokens": 666771168.0, + "reward": 1.4564732313156128, + "reward_std": 0.2963971495628357, + "rewards/accuracy_reward/mean": 0.5509259104728699, + "rewards/accuracy_reward/std": 0.49797651171684265, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.20531632006168365, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 950.7522583007812, + "completions/mean_terminated_length": 723.0215454101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.23461722870385168, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15090076382920362, + "kl": 0.02001953125, + "learning_rate": 9.514759477142936e-07, + "loss": 0.0994, + "num_tokens": 667258497.0, + "reward": 1.5496652126312256, + "reward_std": 0.32379835844039917, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.23282569646835327, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1077.5023193359375, + "completions/mean_terminated_length": 816.3201293945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.23483032337115764, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12612378405274122, + "kl": 0.018890380859375, + "learning_rate": 9.513246221083932e-07, + "loss": 0.1023, + "num_tokens": 667812834.0, + "reward": 1.2232143878936768, + "reward_std": 0.3761574625968933, + "rewards/accuracy_reward/mean": 0.3169642984867096, + "rewards/accuracy_reward/std": 0.4658135175704956, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.22216394543647766, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1134.65625, + "completions/mean_terminated_length": 895.3858642578125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2350434180384636, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12488737895319843, + "kl": 0.0155487060546875, + "learning_rate": 9.51173074400033e-07, + "loss": 0.1017, + "num_tokens": 668393864.0, + "reward": 1.3275669813156128, + "reward_std": 0.36529234051704407, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.23073045909404755, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1240.8304443359375, + "completions/mean_terminated_length": 910.8552856445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23525651270576953, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12567540433915395, + "kl": 0.0176544189453125, + "learning_rate": 9.510213046730833e-07, + "loss": 0.1129, + "num_tokens": 669021516.0, + "reward": 1.3013393878936768, + "reward_std": 0.39001351594924927, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8705357313156128, + "rewards/tag_count_reward/std": 0.27978527545928955, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1148.2366943359375, + "completions/mean_terminated_length": 925.1754760742188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23546960737307548, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11805506609562257, + "kl": 0.0167083740234375, + "learning_rate": 9.508693130115372e-07, + "loss": 0.0716, + "num_tokens": 669611718.0, + "reward": 1.4369419813156128, + "reward_std": 0.43341225385665894, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9012276530265808, + "rewards/tag_count_reward/std": 0.24264031648635864, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1048.5335693359375, + "completions/mean_terminated_length": 896.9434204101562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.23568270204038144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1370970954546268, + "kl": 0.0194091796875, + "learning_rate": 9.507170994995101e-07, + "loss": 0.0947, + "num_tokens": 670150869.0, + "reward": 1.5195313692092896, + "reward_std": 0.41213858127593994, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9056919813156128, + "rewards/tag_count_reward/std": 0.22474436461925507, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1184.0670166015625, + "completions/mean_terminated_length": 954.6610107421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2358957967076874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11874730087574582, + "kl": 0.0170745849609375, + "learning_rate": 9.505646642212405e-07, + "loss": 0.0998, + "num_tokens": 670751667.0, + "reward": 1.4447544813156128, + "reward_std": 0.38637077808380127, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9045758843421936, + "rewards/tag_count_reward/std": 0.2273675799369812, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1144.841552734375, + "completions/mean_terminated_length": 875.2029418945312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.23610889137499333, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1161403916441395, + "kl": 0.016937255859375, + "learning_rate": 9.504120072610904e-07, + "loss": 0.1007, + "num_tokens": 671338172.0, + "reward": 1.3364956378936768, + "reward_std": 0.4597889482975006, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8833705186843872, + "rewards/tag_count_reward/std": 0.2616143822669983, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 996.6585083007812, + "completions/mean_terminated_length": 821.4349365234375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.2363219860422993, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1214782776680158, + "kl": 0.01959228515625, + "learning_rate": 9.502591287035428e-07, + "loss": 0.0707, + "num_tokens": 671844355.0, + "reward": 1.5931919813156128, + "reward_std": 0.2933616638183594, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.17478010058403015, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 985.6428833007812, + "completions/mean_terminated_length": 805.3472900390625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.23653508070960524, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.2651676553697034, + "kl": 0.026123046875, + "learning_rate": 9.501060286332048e-07, + "loss": 0.0579, + "num_tokens": 672355939.0, + "reward": 1.5083706378936768, + "reward_std": 0.3285157084465027, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1929948478937149, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1115.01123046875, + "completions/mean_terminated_length": 857.1766357421875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2367481753769112, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12933324792104972, + "kl": 0.01837158203125, + "learning_rate": 9.499527071348056e-07, + "loss": 0.0954, + "num_tokens": 672923048.0, + "reward": 1.3867188692092896, + "reward_std": 0.38290297985076904, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.8733258843421936, + "rewards/tag_count_reward/std": 0.2468990832567215, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1080.263427734375, + "completions/mean_terminated_length": 791.344970703125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.23696127004421713, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12221739273146501, + "kl": 0.016937255859375, + "learning_rate": 9.497991642931966e-07, + "loss": 0.0424, + "num_tokens": 673475342.0, + "reward": 1.3911831378936768, + "reward_std": 0.359164834022522, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9246651530265808, + "rewards/tag_count_reward/std": 0.20491689443588257, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1020.0245971679688, + "completions/mean_terminated_length": 848.6953125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.2371743647115231, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13417200527324016, + "kl": 0.017822265625, + "learning_rate": 9.496454001933522e-07, + "loss": 0.0761, + "num_tokens": 674002297.0, + "reward": 1.4185268878936768, + "reward_std": 0.3305349349975586, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9341517686843872, + "rewards/tag_count_reward/std": 0.18429669737815857, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1176.296875, + "completions/mean_terminated_length": 912.7587280273438, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.23738745937882905, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10479471547803462, + "kl": 0.0163116455078125, + "learning_rate": 9.494914149203691e-07, + "loss": 0.0311, + "num_tokens": 674597518.0, + "reward": 1.3035714626312256, + "reward_std": 0.3514695465564728, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.2151644378900528, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1046.3951416015625, + "completions/mean_terminated_length": 815.2554931640625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.237600554046135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13787646103822634, + "kl": 0.018768310546875, + "learning_rate": 9.493372085594664e-07, + "loss": 0.1247, + "num_tokens": 675132799.0, + "reward": 1.2566964626312256, + "reward_std": 0.3821662664413452, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8705357313156128, + "rewards/tag_count_reward/std": 0.2519102394580841, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1071.8482666015625, + "completions/mean_terminated_length": 843.2727661132812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23781364871344093, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12289956267841243, + "kl": 0.0166015625, + "learning_rate": 9.491827811959852e-07, + "loss": 0.0768, + "num_tokens": 675679723.0, + "reward": 1.4330357313156128, + "reward_std": 0.4378242492675781, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.2271430343389511, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 931.513427734375, + "completions/mean_terminated_length": 755.5297241210938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2380267433807469, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12399650810657688, + "kl": 0.0185546875, + "learning_rate": 9.490281329153895e-07, + "loss": 0.0511, + "num_tokens": 676164289.0, + "reward": 1.5452009439468384, + "reward_std": 0.36436668038368225, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.1962151974439621, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1058.18310546875, + "completions/mean_terminated_length": 777.404052734375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.23823983804805285, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13645637945747782, + "kl": 0.01806640625, + "learning_rate": 9.488732638032653e-07, + "loss": 0.0754, + "num_tokens": 676702611.0, + "reward": 1.3470982313156128, + "reward_std": 0.3205311894416809, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.26109668612480164, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1110.9285888671875, + "completions/mean_terminated_length": 862.1016845703125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2384529327153588, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13171320101676276, + "kl": 0.017578125, + "learning_rate": 9.487181739453207e-07, + "loss": 0.1547, + "num_tokens": 677264259.0, + "reward": 1.3258929252624512, + "reward_std": 0.4249982237815857, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.25470954179763794, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1161.7723388671875, + "completions/mean_terminated_length": 897.1884155273438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23866602738266474, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1179876710063193, + "kl": 0.01617431640625, + "learning_rate": 9.485628634273861e-07, + "loss": 0.0974, + "num_tokens": 677861421.0, + "reward": 1.3258929252624512, + "reward_std": 0.3822091221809387, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8616071343421936, + "rewards/tag_count_reward/std": 0.26722386479377747, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1164.03125, + "completions/mean_terminated_length": 840.6279907226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2388791220499707, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1258067203278698, + "kl": 0.017181396484375, + "learning_rate": 9.484073323354139e-07, + "loss": 0.1062, + "num_tokens": 678449467.0, + "reward": 1.3593751192092896, + "reward_std": 0.4407947063446045, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8683035969734192, + "rewards/tag_count_reward/std": 0.2721400260925293, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1006.9375610351562, + "completions/mean_terminated_length": 794.247314453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23909221671727665, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13871572209995917, + "kl": 0.01922607421875, + "learning_rate": 9.482515807554788e-07, + "loss": 0.098, + "num_tokens": 678970575.0, + "reward": 1.3978794813156128, + "reward_std": 0.33868899941444397, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8666294813156128, + "rewards/tag_count_reward/std": 0.2567592263221741, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1040.94873046875, + "completions/mean_terminated_length": 848.1090087890625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2393053113845826, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12808558757991312, + "kl": 0.018035888671875, + "learning_rate": 9.480956087737774e-07, + "loss": 0.0444, + "num_tokens": 679511032.0, + "reward": 1.4966518878936768, + "reward_std": 0.35735705494880676, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.2142428457736969, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1223.6785888671875, + "completions/mean_terminated_length": 928.9212036132812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23951840605188854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12305518503553173, + "kl": 0.01556396484375, + "learning_rate": 9.479394164766281e-07, + "loss": 0.1285, + "num_tokens": 680134200.0, + "reward": 1.356584906578064, + "reward_std": 0.46816831827163696, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8543526530265808, + "rewards/tag_count_reward/std": 0.2810535132884979, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1021.6652221679688, + "completions/mean_terminated_length": 815.297607421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2397315007191945, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1374047914609163, + "kl": 0.017791748046875, + "learning_rate": 9.477830039504714e-07, + "loss": 0.0918, + "num_tokens": 680663634.0, + "reward": 1.4910714626312256, + "reward_std": 0.3470616936683655, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8928571343421936, + "rewards/tag_count_reward/std": 0.23699811100959778, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1131.44873046875, + "completions/mean_terminated_length": 854.3517456054688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.23994459538650045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.129918987632575, + "kl": 0.016326904296875, + "learning_rate": 9.476263712818698e-07, + "loss": 0.1077, + "num_tokens": 681239659.0, + "reward": 1.3431919813156128, + "reward_std": 0.38743600249290466, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8989955186843872, + "rewards/tag_count_reward/std": 0.2399759739637375, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1028.294677734375, + "completions/mean_terminated_length": 786.0442504882812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2401576900538064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14134344131680723, + "kl": 0.018707275390625, + "learning_rate": 9.474695185575072e-07, + "loss": 0.1402, + "num_tokens": 681765375.0, + "reward": 1.5033482313156128, + "reward_std": 0.41235625743865967, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.8917410969734192, + "rewards/tag_count_reward/std": 0.24859534204006195, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1128.15625, + "completions/mean_terminated_length": 863.8333129882812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.24037078472111234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12919158868521846, + "kl": 0.01788330078125, + "learning_rate": 9.473124458641901e-07, + "loss": 0.0785, + "num_tokens": 682338389.0, + "reward": 1.450334906578064, + "reward_std": 0.43026217818260193, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8744419813156128, + "rewards/tag_count_reward/std": 0.2747781574726105, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1157.46875, + "completions/mean_terminated_length": 860.625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2405838793884183, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12335542332175008, + "kl": 0.016571044921875, + "learning_rate": 9.471551532888456e-07, + "loss": 0.107, + "num_tokens": 682917095.0, + "reward": 1.4068081378936768, + "reward_std": 0.3423016369342804, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8934151530265808, + "rewards/tag_count_reward/std": 0.2512744665145874, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1014.9777221679688, + "completions/mean_terminated_length": 830.12109375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.24079697405572426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13719931551206793, + "kl": 0.016845703125, + "learning_rate": 9.469976409185235e-07, + "loss": 0.0741, + "num_tokens": 683450349.0, + "reward": 1.3962054252624512, + "reward_std": 0.34097760915756226, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.19511540234088898, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1255.7366943359375, + "completions/mean_terminated_length": 945.7205200195312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2410100687230302, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11326073734368769, + "kl": 0.0157928466796875, + "learning_rate": 9.468399088403948e-07, + "loss": 0.0575, + "num_tokens": 684083959.0, + "reward": 1.2410714626312256, + "reward_std": 0.3917922377586365, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8683035969734192, + "rewards/tag_count_reward/std": 0.2736770510673523, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1124.88623046875, + "completions/mean_terminated_length": 908.7300415039062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24122316339033617, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.126178465595855, + "kl": 0.017486572265625, + "learning_rate": 9.466819571417519e-07, + "loss": 0.1059, + "num_tokens": 684655844.0, + "reward": 1.4419643878936768, + "reward_std": 0.3981620967388153, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.2214886099100113, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1139.59375, + "completions/mean_terminated_length": 908.0392456054688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2414362580576421, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12362567164885911, + "kl": 0.020111083984375, + "learning_rate": 9.465237859100093e-07, + "loss": 0.0984, + "num_tokens": 685228798.0, + "reward": 1.3883929252624512, + "reward_std": 0.3500884771347046, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8683035969734192, + "rewards/tag_count_reward/std": 0.26799824833869934, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1151.28125, + "completions/mean_terminated_length": 909.9546508789062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24164935272494806, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11896610908952401, + "kl": 0.01885986328125, + "learning_rate": 9.463653952327024e-07, + "loss": 0.066, + "num_tokens": 685814556.0, + "reward": 1.4274554252624512, + "reward_std": 0.3754560053348541, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9029017686843872, + "rewards/tag_count_reward/std": 0.2412969470024109, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1129.74560546875, + "completions/mean_terminated_length": 845.140380859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24186244739225402, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12633658538528192, + "kl": 0.015045166015625, + "learning_rate": 9.462067851974886e-07, + "loss": 0.1229, + "num_tokens": 686396874.0, + "reward": 1.4157366752624512, + "reward_std": 0.3558761477470398, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.23101703822612762, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1116.212158203125, + "completions/mean_terminated_length": 862.0880737304688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24207554205955997, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11538481927029474, + "kl": 0.017913818359375, + "learning_rate": 9.460479558921459e-07, + "loss": 0.0911, + "num_tokens": 686966553.0, + "reward": 1.3593751192092896, + "reward_std": 0.4412151873111725, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9040178656578064, + "rewards/tag_count_reward/std": 0.24347303807735443, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1131.2410888671875, + "completions/mean_terminated_length": 871.186279296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2422886367268659, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12929923824805095, + "kl": 0.016876220703125, + "learning_rate": 9.458889074045747e-07, + "loss": 0.0747, + "num_tokens": 687538645.0, + "reward": 1.4843751192092896, + "reward_std": 0.37626728415489197, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9084821343421936, + "rewards/tag_count_reward/std": 0.2329067587852478, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1048.044677734375, + "completions/mean_terminated_length": 843.752685546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24250173139417186, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12517031114912322, + "kl": 0.019561767578125, + "learning_rate": 9.45729639822796e-07, + "loss": 0.0793, + "num_tokens": 688073977.0, + "reward": 1.493303656578064, + "reward_std": 0.3122672140598297, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.19207492470741272, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1004.9219360351562, + "completions/mean_terminated_length": 818.2658081054688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24271482606147782, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13755099713409896, + "kl": 0.020721435546875, + "learning_rate": 9.455701532349522e-07, + "loss": 0.1012, + "num_tokens": 688594166.0, + "reward": 1.419084906578064, + "reward_std": 0.3213981091976166, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.20244067907333374, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1075.46435546875, + "completions/mean_terminated_length": 820.6873168945312, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.24292792072878377, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13788831577203978, + "kl": 0.02099609375, + "learning_rate": 9.454104477293068e-07, + "loss": 0.1142, + "num_tokens": 689141958.0, + "reward": 1.3989956378936768, + "reward_std": 0.37277212738990784, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8833705186843872, + "rewards/tag_count_reward/std": 0.2545716464519501, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 970.4888916015625, + "completions/mean_terminated_length": 781.0052490234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2431410153960897, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1361819506275324, + "kl": 0.019134521484375, + "learning_rate": 9.452505233942447e-07, + "loss": 0.078, + "num_tokens": 689642689.0, + "reward": 1.4754464626312256, + "reward_std": 0.3517928421497345, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.19571910798549652, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1057.419677734375, + "completions/mean_terminated_length": 848.5946044921875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.24335411006339566, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11490567076191115, + "kl": 0.0169677734375, + "learning_rate": 9.450903803182717e-07, + "loss": 0.1029, + "num_tokens": 690188397.0, + "reward": 1.3247768878936768, + "reward_std": 0.43005073070526123, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.25505974888801575, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1085.4576416015625, + "completions/mean_terminated_length": 853.4874877929688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.24356720473070162, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11883591467647857, + "kl": 0.018524169921875, + "learning_rate": 9.449300185900149e-07, + "loss": 0.1034, + "num_tokens": 690741882.0, + "reward": 1.4029018878936768, + "reward_std": 0.36611419916152954, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.2140796184539795, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1023.9866333007812, + "completions/mean_terminated_length": 821.3743286132812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24378029939800758, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11420225374750567, + "kl": 0.018218994140625, + "learning_rate": 9.447694382982221e-07, + "loss": 0.0751, + "num_tokens": 691267556.0, + "reward": 1.3856027126312256, + "reward_std": 0.32527297735214233, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.18088066577911377, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1120.200927734375, + "completions/mean_terminated_length": 877.1436157226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2439933940653135, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12201012108747257, + "kl": 0.0174407958984375, + "learning_rate": 9.446086395317622e-07, + "loss": 0.0887, + "num_tokens": 691833566.0, + "reward": 1.3844866752624512, + "reward_std": 0.33895665407180786, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.23308825492858887, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1091.88623046875, + "completions/mean_terminated_length": 844.8005981445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24420648873261946, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1170179622713891, + "kl": 0.017425537109375, + "learning_rate": 9.44447622379625e-07, + "loss": 0.0709, + "num_tokens": 692395227.0, + "reward": 1.4402902126312256, + "reward_std": 0.41064462065696716, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9157366156578064, + "rewards/tag_count_reward/std": 0.2317454218864441, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1149.8035888671875, + "completions/mean_terminated_length": 850.40478515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24441958339992542, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12010893928556403, + "kl": 0.0166015625, + "learning_rate": 9.442863869309213e-07, + "loss": 0.0646, + "num_tokens": 692982195.0, + "reward": 1.3816964626312256, + "reward_std": 0.34269511699676514, + "rewards/accuracy_reward/mean": 0.48148149251937866, + "rewards/accuracy_reward/std": 0.5002362728118896, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.2228260338306427, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1135.4888916015625, + "completions/mean_terminated_length": 886.6221923828125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.24463267806723138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6000192959715754, + "kl": 0.05126953125, + "learning_rate": 9.441249332748824e-07, + "loss": 0.0663, + "num_tokens": 693564446.0, + "reward": 1.297991156578064, + "reward_std": 0.3227459192276001, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.22814933955669403, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1029.790283203125, + "completions/mean_terminated_length": 805.0626831054688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2448457727345373, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14467243051835282, + "kl": 0.01776123046875, + "learning_rate": 9.439632615008604e-07, + "loss": 0.1248, + "num_tokens": 694096192.0, + "reward": 1.3493304252624512, + "reward_std": 0.3884987235069275, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.215381920337677, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1156.2835693359375, + "completions/mean_terminated_length": 913.0880737304688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24505886740184327, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1319405199713387, + "kl": 0.01885986328125, + "learning_rate": 9.438013716983289e-07, + "loss": 0.0482, + "num_tokens": 694685551.0, + "reward": 1.2751116752624512, + "reward_std": 0.31994858384132385, + "rewards/accuracy_reward/mean": 0.3660714328289032, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.21274222433567047, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1050.013427734375, + "completions/mean_terminated_length": 823.073974609375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.24527196206914922, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12958232570022526, + "kl": 0.01776123046875, + "learning_rate": 9.436392639568809e-07, + "loss": 0.0861, + "num_tokens": 695218021.0, + "reward": 1.3945313692092896, + "reward_std": 0.3540402948856354, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9056919813156128, + "rewards/tag_count_reward/std": 0.23448731005191803, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1000.2991333007812, + "completions/mean_terminated_length": 822.4908447265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24548505673645518, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12241726943639346, + "kl": 0.01922607421875, + "learning_rate": 9.434769383662307e-07, + "loss": 0.0279, + "num_tokens": 695735803.0, + "reward": 1.4709821939468384, + "reward_std": 0.3499070107936859, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.1687558889389038, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 959.93310546875, + "completions/mean_terminated_length": 775.274169921875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2456981514037611, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14022076688205312, + "kl": 0.019073486328125, + "learning_rate": 9.433143950162134e-07, + "loss": 0.116, + "num_tokens": 696236109.0, + "reward": 1.4748884439468384, + "reward_std": 0.3407088816165924, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.19655847549438477, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 949.872802734375, + "completions/mean_terminated_length": 753.3658447265625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.24591124607106707, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13397138516990292, + "kl": 0.02117919921875, + "learning_rate": 9.43151633996784e-07, + "loss": 0.0716, + "num_tokens": 696732244.0, + "reward": 1.4626116752624512, + "reward_std": 0.3569982349872589, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.21863439679145813, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 960.8281860351562, + "completions/mean_terminated_length": 752.646240234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24612434073837303, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1448121986343773, + "kl": 0.020355224609375, + "learning_rate": 9.429886553980184e-07, + "loss": 0.1232, + "num_tokens": 697226647.0, + "reward": 1.4508929252624512, + "reward_std": 0.3497101366519928, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.2338162064552307, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1034.0023193359375, + "completions/mean_terminated_length": 820.2405395507812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24633743540567898, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1299296540948816, + "kl": 0.016815185546875, + "learning_rate": 9.428254593101128e-07, + "loss": 0.0688, + "num_tokens": 697761992.0, + "reward": 1.380022406578064, + "reward_std": 0.3787865936756134, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.22577518224716187, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 959.33935546875, + "completions/mean_terminated_length": 777.8958740234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2465505300729849, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1420988994117055, + "kl": 0.019744873046875, + "learning_rate": 9.426620458233837e-07, + "loss": 0.1231, + "num_tokens": 698259984.0, + "reward": 1.5429688692092896, + "reward_std": 0.2873460054397583, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.2086479365825653, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 922.6339721679688, + "completions/mean_terminated_length": 745.2506713867188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.24676362474029087, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1277125771130333, + "kl": 0.02081298828125, + "learning_rate": 9.424984150282679e-07, + "loss": 0.0683, + "num_tokens": 698740332.0, + "reward": 1.5145089626312256, + "reward_std": 0.28034472465515137, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.18940123915672302, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1071.80810546875, + "completions/mean_terminated_length": 849.82470703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24697671940759683, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13903483653511328, + "kl": 0.017578125, + "learning_rate": 9.423345670153225e-07, + "loss": 0.1148, + "num_tokens": 699294918.0, + "reward": 1.340959906578064, + "reward_std": 0.37994667887687683, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.22031240165233612, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1073.544677734375, + "completions/mean_terminated_length": 797.1232299804688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2471898140749028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1331404826986215, + "kl": 0.017822265625, + "learning_rate": 9.421705018752252e-07, + "loss": 0.1375, + "num_tokens": 699842362.0, + "reward": 1.4001116752624512, + "reward_std": 0.4011727273464203, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.21589355170726776, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1078.279052734375, + "completions/mean_terminated_length": 844.5789184570312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24740290874220872, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12522523995887483, + "kl": 0.020538330078125, + "learning_rate": 9.420062196987729e-07, + "loss": 0.0608, + "num_tokens": 700396551.0, + "reward": 1.4927456378936768, + "reward_std": 0.35562044382095337, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960493743419647, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.1941235363483429, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1087.7366943359375, + "completions/mean_terminated_length": 846.32958984375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.24761600340951467, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.17851690284384417, + "kl": 0.024688720703125, + "learning_rate": 9.418417205768836e-07, + "loss": 0.1213, + "num_tokens": 700944913.0, + "reward": 1.2751116752624512, + "reward_std": 0.3589559495449066, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8978794813156128, + "rewards/tag_count_reward/std": 0.25587278604507446, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1046.8460693359375, + "completions/mean_terminated_length": 815.8104858398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24782909807682063, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39339112525832093, + "kl": 0.0181884765625, + "learning_rate": 9.41677004600595e-07, + "loss": 0.1299, + "num_tokens": 701480140.0, + "reward": 1.5295759439468384, + "reward_std": 0.350629061460495, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9246651530265808, + "rewards/tag_count_reward/std": 0.21685130894184113, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 988.91748046875, + "completions/mean_terminated_length": 740.9228515625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2480421927441266, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13564337694287523, + "kl": 0.019378662109375, + "learning_rate": 9.415120718610646e-07, + "loss": 0.082, + "num_tokens": 701989143.0, + "reward": 1.4637277126312256, + "reward_std": 0.3330329656600952, + "rewards/accuracy_reward/mean": 0.5717592835426331, + "rewards/accuracy_reward/std": 0.49539753794670105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9123883843421936, + "rewards/tag_count_reward/std": 0.2280583381652832, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1087.796875, + "completions/mean_terminated_length": 862.9559326171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24825528741143252, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11561748260031024, + "kl": 0.018585205078125, + "learning_rate": 9.413469224495701e-07, + "loss": 0.0907, + "num_tokens": 702544012.0, + "reward": 1.4280134439468384, + "reward_std": 0.3665209412574768, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9034598469734192, + "rewards/tag_count_reward/std": 0.23357515037059784, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1060.8795166015625, + "completions/mean_terminated_length": 839.7213134765625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.24846838207873848, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1291688686745437, + "kl": 0.017852783203125, + "learning_rate": 9.41181556457509e-07, + "loss": 0.0828, + "num_tokens": 703082582.0, + "reward": 1.4056919813156128, + "reward_std": 0.35266488790512085, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.2245887815952301, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1045.22998046875, + "completions/mean_terminated_length": 813.8214721679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.24868147674604443, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12649082532650746, + "kl": 0.0171661376953125, + "learning_rate": 9.41015973976399e-07, + "loss": 0.0982, + "num_tokens": 703626109.0, + "reward": 1.4168527126312256, + "reward_std": 0.34334796667099, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.21320530772209167, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 982.9152221679688, + "completions/mean_terminated_length": 754.888916015625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2488945714133504, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12389374550919054, + "kl": 0.01812744140625, + "learning_rate": 9.408501750978769e-07, + "loss": 0.0522, + "num_tokens": 704132567.0, + "reward": 1.3543527126312256, + "reward_std": 0.2990226745605469, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.1695888340473175, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1116.59375, + "completions/mean_terminated_length": 848.9482421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24910766608065632, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12005508477364267, + "kl": 0.0162353515625, + "learning_rate": 9.406841599137e-07, + "loss": 0.0695, + "num_tokens": 704702049.0, + "reward": 1.4414063692092896, + "reward_std": 0.30317384004592896, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18412980437278748, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1076.071533203125, + "completions/mean_terminated_length": 831.7318115234375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.24932076074796228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1393368838053309, + "kl": 0.019378662109375, + "learning_rate": 9.40517928515745e-07, + "loss": 0.067, + "num_tokens": 705253601.0, + "reward": 1.4157366752624512, + "reward_std": 0.3868698179721832, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9157366156578064, + "rewards/tag_count_reward/std": 0.21418675780296326, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1040.0960693359375, + "completions/mean_terminated_length": 786.7122802734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24953385541526824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13988853026545434, + "kl": 0.018707275390625, + "learning_rate": 9.403514809960081e-07, + "loss": 0.1051, + "num_tokens": 705792508.0, + "reward": 1.3688616752624512, + "reward_std": 0.3498436510562897, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.2114827185869217, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1134.790283203125, + "completions/mean_terminated_length": 830.3869018554688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2497469500825742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.130087327815181, + "kl": 0.017120361328125, + "learning_rate": 9.401848174466053e-07, + "loss": 0.0665, + "num_tokens": 706371918.0, + "reward": 1.2845982313156128, + "reward_std": 0.40470418334007263, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.24499371647834778, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 970.1094360351562, + "completions/mean_terminated_length": 760.2799682617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.24996004474988012, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11770602808542026, + "kl": 0.017486572265625, + "learning_rate": 9.400179379597721e-07, + "loss": 0.0579, + "num_tokens": 706870303.0, + "reward": 1.387834906578064, + "reward_std": 0.26535508036613464, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19872502982616425, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1069.8973388671875, + "completions/mean_terminated_length": 788.8333129882812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2501731394171861, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11319270923412346, + "kl": 0.01739501953125, + "learning_rate": 9.398508426278637e-07, + "loss": 0.1043, + "num_tokens": 707423409.0, + "reward": 1.4062501192092896, + "reward_std": 0.37307223677635193, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.21254922449588776, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 992.0558471679688, + "completions/mean_terminated_length": 793.1909790039062, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.250386234084492, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1334958717509608, + "kl": 0.01873779296875, + "learning_rate": 9.396835315433543e-07, + "loss": 0.1403, + "num_tokens": 707942842.0, + "reward": 1.5390626192092896, + "reward_std": 0.42992103099823, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9073660969734192, + "rewards/tag_count_reward/std": 0.23065266013145447, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1025.9263916015625, + "completions/mean_terminated_length": 779.609375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.25059932875179797, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13944002728397867, + "kl": 0.019287109375, + "learning_rate": 9.395160047988379e-07, + "loss": 0.074, + "num_tokens": 708469945.0, + "reward": 1.3521206378936768, + "reward_std": 0.36226117610931396, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.2208217978477478, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1026.1295166015625, + "completions/mean_terminated_length": 820.6595458984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2508124234191039, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12728507298197414, + "kl": 0.017425537109375, + "learning_rate": 9.393482624870281e-07, + "loss": 0.0812, + "num_tokens": 709001987.0, + "reward": 1.4185268878936768, + "reward_std": 0.3376017212867737, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9341517686843872, + "rewards/tag_count_reward/std": 0.19246125221252441, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1014.8772583007812, + "completions/mean_terminated_length": 817.045166015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2510255180864099, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13364980461477466, + "kl": 0.0184326171875, + "learning_rate": 9.391803047007567e-07, + "loss": 0.0535, + "num_tokens": 709527084.0, + "reward": 1.4107143878936768, + "reward_std": 0.2955436706542969, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.1875898689031601, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1134.2835693359375, + "completions/mean_terminated_length": 904.5781860351562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.25123861275371584, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.113471643875866, + "kl": 0.0142974853515625, + "learning_rate": 9.39012131532976e-07, + "loss": 0.1108, + "num_tokens": 710106267.0, + "reward": 1.4068081378936768, + "reward_std": 0.3691440522670746, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.21175408363342285, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 953.1897583007812, + "completions/mean_terminated_length": 750.4470825195312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2514517074210218, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13565280406939878, + "kl": 0.019195556640625, + "learning_rate": 9.388437430767568e-07, + "loss": 0.1158, + "num_tokens": 710592608.0, + "reward": 1.5184152126312256, + "reward_std": 0.3573632538318634, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.21910780668258667, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1042.8304443359375, + "completions/mean_terminated_length": 850.3510131835938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.25166480208832775, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10860656602048822, + "kl": 0.0178375244140625, + "learning_rate": 9.386751394252895e-07, + "loss": -0.0002, + "num_tokens": 711136564.0, + "reward": 1.5055804252624512, + "reward_std": 0.3420940637588501, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.16209600865840912, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1039.09375, + "completions/mean_terminated_length": 829.6981201171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2518778967556337, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11561013449727518, + "kl": 0.01983642578125, + "learning_rate": 9.385063206718826e-07, + "loss": 0.0633, + "num_tokens": 711672670.0, + "reward": 1.5396206378936768, + "reward_std": 0.4023180603981018, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.20518478751182556, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1084.122802734375, + "completions/mean_terminated_length": 821.2471923828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2520909914229396, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12212536123438834, + "kl": 0.018310546875, + "learning_rate": 9.383372869099652e-07, + "loss": 0.0889, + "num_tokens": 712222453.0, + "reward": 1.4324777126312256, + "reward_std": 0.3501165211200714, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.20216916501522064, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1063.857177734375, + "completions/mean_terminated_length": 816.4468994140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.25230408609024557, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1422060690030813, + "kl": 0.017974853515625, + "learning_rate": 9.381680382330841e-07, + "loss": 0.0958, + "num_tokens": 712773205.0, + "reward": 1.3744419813156128, + "reward_std": 0.3827194571495056, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9079241156578064, + "rewards/tag_count_reward/std": 0.23238559067249298, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1070.9576416015625, + "completions/mean_terminated_length": 877.6390380859375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.25251718075755153, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12678636513120645, + "kl": 0.0194091796875, + "learning_rate": 9.379985747349056e-07, + "loss": 0.0494, + "num_tokens": 713324434.0, + "reward": 1.4174107313156128, + "reward_std": 0.3997945487499237, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.21109940111637115, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1033.575927734375, + "completions/mean_terminated_length": 848.891845703125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2527302754248575, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1291948979286699, + "kl": 0.0198974609375, + "learning_rate": 9.378288965092145e-07, + "loss": 0.0511, + "num_tokens": 713856740.0, + "reward": 1.462053656578064, + "reward_std": 0.3138492703437805, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.18993109464645386, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1124.4398193359375, + "completions/mean_terminated_length": 855.6224365234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.25294337009216344, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13215387343909, + "kl": 0.018035888671875, + "learning_rate": 9.376590036499152e-07, + "loss": 0.0893, + "num_tokens": 714439977.0, + "reward": 1.364397406578064, + "reward_std": 0.3469241261482239, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9135044813156128, + "rewards/tag_count_reward/std": 0.21394765377044678, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1063.265625, + "completions/mean_terminated_length": 829.3232421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2531564647594694, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11914512375886392, + "kl": 0.01751708984375, + "learning_rate": 9.374888962510302e-07, + "loss": 0.0596, + "num_tokens": 714983792.0, + "reward": 1.4218751192092896, + "reward_std": 0.3380487561225891, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.19915880262851715, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1105.6273193359375, + "completions/mean_terminated_length": 862.0927124023438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.25336955942677536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12119205690446866, + "kl": 0.017578125, + "learning_rate": 9.373185744067006e-07, + "loss": 0.0383, + "num_tokens": 715562233.0, + "reward": 1.4977679252624512, + "reward_std": 0.40225204825401306, + "rewards/accuracy_reward/mean": 0.5949074029922485, + "rewards/accuracy_reward/std": 0.49147912859916687, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.2016134113073349, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1156.544677734375, + "completions/mean_terminated_length": 900.3793334960938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2535826540940813, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12164394110606176, + "kl": 0.0157012939453125, + "learning_rate": 9.371480382111869e-07, + "loss": 0.0766, + "num_tokens": 716144525.0, + "reward": 1.2544643878936768, + "reward_std": 0.3277755677700043, + "rewards/accuracy_reward/mean": 0.3236607015132904, + "rewards/accuracy_reward/std": 0.46839529275894165, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.20808550715446472, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1128.6942138671875, + "completions/mean_terminated_length": 881.2889404296875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.2537957487613872, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11431480954744451, + "kl": 0.0168304443359375, + "learning_rate": 9.369772877588679e-07, + "loss": 0.0739, + "num_tokens": 716714148.0, + "reward": 1.3939732313156128, + "reward_std": 0.41670045256614685, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8805803656578064, + "rewards/tag_count_reward/std": 0.2616889178752899, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1154.118408203125, + "completions/mean_terminated_length": 890.60400390625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.2540088434286932, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12084179441173191, + "kl": 0.017303466796875, + "learning_rate": 9.368063231442404e-07, + "loss": 0.0827, + "num_tokens": 717299593.0, + "reward": 1.4101563692092896, + "reward_std": 0.3786208927631378, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9168526530265808, + "rewards/tag_count_reward/std": 0.21001364290714264, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 892.4263916015625, + "completions/mean_terminated_length": 734.0482177734375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.25422193809599913, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15501915749431122, + "kl": 0.021697998046875, + "learning_rate": 9.366351444619207e-07, + "loss": 0.093, + "num_tokens": 717761608.0, + "reward": 1.6021206378936768, + "reward_std": 0.3924897611141205, + "rewards/accuracy_reward/mean": 0.6741071343421936, + "rewards/accuracy_reward/std": 0.4692314565181732, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9280133843421936, + "rewards/tag_count_reward/std": 0.2047584503889084, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1093.5045166015625, + "completions/mean_terminated_length": 873.2362670898438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.2544350327633051, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11945032285356612, + "kl": 0.0194091796875, + "learning_rate": 9.364637518066431e-07, + "loss": 0.0961, + "num_tokens": 718315482.0, + "reward": 1.4040179252624512, + "reward_std": 0.32677170634269714, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.18845300376415253, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1028.546875, + "completions/mean_terminated_length": 816.9622192382812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.25464812743061105, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12288813369377476, + "kl": 0.0185546875, + "learning_rate": 9.362921452732598e-07, + "loss": 0.0877, + "num_tokens": 718845743.0, + "reward": 1.5172991752624512, + "reward_std": 0.34804025292396545, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9123883843421936, + "rewards/tag_count_reward/std": 0.2212115377187729, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1042.97998046875, + "completions/mean_terminated_length": 863.13427734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.254861222097917, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1275499782028147, + "kl": 0.020050048828125, + "learning_rate": 9.361203249567424e-07, + "loss": 0.0951, + "num_tokens": 719383430.0, + "reward": 1.4687501192092896, + "reward_std": 0.33225592970848083, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.2053801566362381, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1053.2366943359375, + "completions/mean_terminated_length": 862.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.25507431676522296, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13040743814687228, + "kl": 0.019287109375, + "learning_rate": 9.359482909521802e-07, + "loss": 0.1084, + "num_tokens": 719922608.0, + "reward": 1.442522406578064, + "reward_std": 0.37619295716285706, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.2044655829668045, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1188.607177734375, + "completions/mean_terminated_length": 938.4668579101562, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.2552874114325289, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12890735174445375, + "kl": 0.01708984375, + "learning_rate": 9.357760433547807e-07, + "loss": 0.1283, + "num_tokens": 720528560.0, + "reward": 1.446428656578064, + "reward_std": 0.3331867754459381, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9084821343421936, + "rewards/tag_count_reward/std": 0.21866366267204285, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1098.029052734375, + "completions/mean_terminated_length": 888.3623657226562, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.2555005060998349, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11050528131623757, + "kl": 0.017822265625, + "learning_rate": 9.356035822598699e-07, + "loss": 0.0727, + "num_tokens": 721097149.0, + "reward": 1.5301339626312256, + "reward_std": 0.30820971727371216, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9341517686843872, + "rewards/tag_count_reward/std": 0.18505382537841797, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1133.0491943359375, + "completions/mean_terminated_length": 903.0335083007812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2557136007671408, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11803966505992211, + "kl": 0.01708984375, + "learning_rate": 9.354309077628918e-07, + "loss": 0.1082, + "num_tokens": 721690483.0, + "reward": 1.3437501192092896, + "reward_std": 0.3707781136035919, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.2252444326877594, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1081.625, + "completions/mean_terminated_length": 848.7312622070312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.25592669543444674, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.3887678034627187, + "kl": 0.020965576171875, + "learning_rate": 9.352580199594084e-07, + "loss": 0.0412, + "num_tokens": 722252747.0, + "reward": 1.3688616752624512, + "reward_std": 0.36884328722953796, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.21372579038143158, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 977.9063110351562, + "completions/mean_terminated_length": 748.8076171875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2561397901017527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.136920335796482, + "kl": 0.019012451171875, + "learning_rate": 9.350849189451e-07, + "loss": 0.0558, + "num_tokens": 722761137.0, + "reward": 1.575334906578064, + "reward_std": 0.3258565664291382, + "rewards/accuracy_reward/mean": 0.6428571343421936, + "rewards/accuracy_reward/std": 0.47969305515289307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.1958458423614502, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1108.140625, + "completions/mean_terminated_length": 875.1392822265625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.25635288476905865, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12358339108355751, + "kl": 0.01861572265625, + "learning_rate": 9.349116048157645e-07, + "loss": 0.0524, + "num_tokens": 723329440.0, + "reward": 1.3973214626312256, + "reward_std": 0.384210467338562, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.1891935020685196, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1137.243408203125, + "completions/mean_terminated_length": 875.5316162109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2565659794363646, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12183898447172595, + "kl": 0.018310546875, + "learning_rate": 9.347380776673185e-07, + "loss": 0.1109, + "num_tokens": 723905261.0, + "reward": 1.3984376192092896, + "reward_std": 0.3694113492965698, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401999473572, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.21083606779575348, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 992.22998046875, + "completions/mean_terminated_length": 813.05224609375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.25677907410367057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13564871199570044, + "kl": 0.018035888671875, + "learning_rate": 9.345643375957955e-07, + "loss": 0.083, + "num_tokens": 724410628.0, + "reward": 1.4375001192092896, + "reward_std": 0.3085968494415283, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9129464030265808, + "rewards/tag_count_reward/std": 0.20942506194114685, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 956.19873046875, + "completions/mean_terminated_length": 780.8316040039062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2569921687709765, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13642613669603001, + "kl": 0.02191162109375, + "learning_rate": 9.343903846973475e-07, + "loss": 0.0887, + "num_tokens": 724905117.0, + "reward": 1.4575893878936768, + "reward_std": 0.31205737590789795, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.16782118380069733, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 918.232177734375, + "completions/mean_terminated_length": 719.55908203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2572052634382825, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14826309943318305, + "kl": 0.021240234375, + "learning_rate": 9.342162190682442e-07, + "loss": 0.061, + "num_tokens": 725382805.0, + "reward": 1.5055804252624512, + "reward_std": 0.3464234471321106, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.18508079648017883, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 963.1741333007812, + "completions/mean_terminated_length": 792.180908203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2574183581055884, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12389017570976057, + "kl": 0.0206298828125, + "learning_rate": 9.34041840804873e-07, + "loss": 0.0751, + "num_tokens": 725883843.0, + "reward": 1.6411831378936768, + "reward_std": 0.3124764859676361, + "rewards/accuracy_reward/mean": 0.6964285969734192, + "rewards/accuracy_reward/std": 0.4603137671947479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.17976601421833038, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1035.649658203125, + "completions/mean_terminated_length": 818.9132690429688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.25763145277289434, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.22205956941423924, + "kl": 0.024169921875, + "learning_rate": 9.338672500037387e-07, + "loss": 0.104, + "num_tokens": 726424582.0, + "reward": 1.4380581378936768, + "reward_std": 0.29289552569389343, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.21241335570812225, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1056.3795166015625, + "completions/mean_terminated_length": 853.7903442382812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2578445474402003, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12472590543238136, + "kl": 0.0186767578125, + "learning_rate": 9.336924467614641e-07, + "loss": 0.0733, + "num_tokens": 726969856.0, + "reward": 1.4006696939468384, + "reward_std": 0.34661921858787537, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.20614159107208252, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1101.87060546875, + "completions/mean_terminated_length": 864.0167236328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.25805764210750626, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1182586249231736, + "kl": 0.017425537109375, + "learning_rate": 9.335174311747893e-07, + "loss": 0.0584, + "num_tokens": 727536406.0, + "reward": 1.4760044813156128, + "reward_std": 0.3336218595504761, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.1743152141571045, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1129.779052734375, + "completions/mean_terminated_length": 908.4902954101562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2582707367748122, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11041467398894227, + "kl": 0.018768310546875, + "learning_rate": 9.333422033405722e-07, + "loss": 0.0602, + "num_tokens": 728113155.0, + "reward": 1.4414063692092896, + "reward_std": 0.3210981488227844, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.22768031060695648, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1057.1920166015625, + "completions/mean_terminated_length": 854.768798828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2584838314421182, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13459729994573613, + "kl": 0.019073486328125, + "learning_rate": 9.331667633557877e-07, + "loss": 0.1247, + "num_tokens": 728656825.0, + "reward": 1.3705357313156128, + "reward_std": 0.34038785099983215, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8995535969734192, + "rewards/tag_count_reward/std": 0.2462465763092041, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1151.899658203125, + "completions/mean_terminated_length": 842.4354858398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.25869692610942413, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12343725773168232, + "kl": 0.01708984375, + "learning_rate": 9.329911113175289e-07, + "loss": 0.0654, + "num_tokens": 729242588.0, + "reward": 1.3465402126312256, + "reward_std": 0.3305869996547699, + "rewards/accuracy_reward/mean": 0.45370370149612427, + "rewards/accuracy_reward/std": 0.49842923879623413, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.23580926656723022, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1143.0826416015625, + "completions/mean_terminated_length": 918.7437133789062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.2589100207767301, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10932401625998242, + "kl": 0.0166473388671875, + "learning_rate": 9.328152473230052e-07, + "loss": 0.1034, + "num_tokens": 729825553.0, + "reward": 1.4380581378936768, + "reward_std": 0.30797672271728516, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.18520469963550568, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1006.9397583007812, + "completions/mean_terminated_length": 804.2799682617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.259123115444036, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1362992922962979, + "kl": 0.019622802734375, + "learning_rate": 9.326391714695443e-07, + "loss": 0.0845, + "num_tokens": 730350374.0, + "reward": 1.5117188692092896, + "reward_std": 0.34411683678627014, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19555239379405975, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1060.712158203125, + "completions/mean_terminated_length": 812.5111694335938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.25933621011134195, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12419567343749208, + "kl": 0.0175018310546875, + "learning_rate": 9.324628838545905e-07, + "loss": 0.0888, + "num_tokens": 730890133.0, + "reward": 1.5161831378936768, + "reward_std": 0.4181612730026245, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.22583600878715515, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1004.6607666015625, + "completions/mean_terminated_length": 804.872314453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2595493047786479, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13049072762405026, + "kl": 0.02130126953125, + "learning_rate": 9.322863845757054e-07, + "loss": 0.0892, + "num_tokens": 731411229.0, + "reward": 1.3638393878936768, + "reward_std": 0.3830179274082184, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9151785969734192, + "rewards/tag_count_reward/std": 0.22072072327136993, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1075.953125, + "completions/mean_terminated_length": 898.9841918945312, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.25976239944595386, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12467741413432065, + "kl": 0.018829345703125, + "learning_rate": 9.321096737305679e-07, + "loss": 0.0853, + "num_tokens": 731961240.0, + "reward": 1.5061384439468384, + "reward_std": 0.3604619801044464, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9190848469734192, + "rewards/tag_count_reward/std": 0.21286541223526, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1115.8192138671875, + "completions/mean_terminated_length": 833.9970703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2599754941132598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8438915871257744, + "kl": 0.086090087890625, + "learning_rate": 9.319327514169742e-07, + "loss": 0.0851, + "num_tokens": 732530599.0, + "reward": 1.4224331378936768, + "reward_std": 0.39743557572364807, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8777901530265808, + "rewards/tag_count_reward/std": 0.2574244737625122, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1021.7098388671875, + "completions/mean_terminated_length": 838.0579223632812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2601885887805658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14065909553917508, + "kl": 0.0198974609375, + "learning_rate": 9.31755617732837e-07, + "loss": 0.0705, + "num_tokens": 733059701.0, + "reward": 1.493303656578064, + "reward_std": 0.3630177080631256, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.19200991094112396, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1029.7410888671875, + "completions/mean_terminated_length": 824.9973754882812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.26040168344787173, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1350408112016386, + "kl": 0.0190582275390625, + "learning_rate": 9.315782727761861e-07, + "loss": 0.061, + "num_tokens": 733585457.0, + "reward": 1.4274554252624512, + "reward_std": 0.28504908084869385, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.20284497737884521, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1132.727783203125, + "completions/mean_terminated_length": 845.53076171875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2606147781151777, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13741809612190153, + "kl": 0.01788330078125, + "learning_rate": 9.314007166451688e-07, + "loss": 0.146, + "num_tokens": 734160903.0, + "reward": 1.2801339626312256, + "reward_std": 0.3947989046573639, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.25560733675956726, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1179.359375, + "completions/mean_terminated_length": 920.026123046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2608278727824836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12950320030316076, + "kl": 0.0167236328125, + "learning_rate": 9.312229494380485e-07, + "loss": 0.0808, + "num_tokens": 734755656.0, + "reward": 1.4525669813156128, + "reward_std": 0.38254234194755554, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9123883843421936, + "rewards/tag_count_reward/std": 0.22434963285923004, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1040.930908203125, + "completions/mean_terminated_length": 838.43701171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26104096744978955, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13226573877139045, + "kl": 0.0194091796875, + "learning_rate": 9.310449712532058e-07, + "loss": 0.1033, + "num_tokens": 735293721.0, + "reward": 1.5407366752624512, + "reward_std": 0.30943822860717773, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.19310477375984192, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1183.575927734375, + "completions/mean_terminated_length": 981.1625366210938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2612540621170955, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11186234345853083, + "kl": 0.017730712890625, + "learning_rate": 9.308667821891381e-07, + "loss": 0.1347, + "num_tokens": 735895163.0, + "reward": 1.3789063692092896, + "reward_std": 0.4183768630027771, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8833705186843872, + "rewards/tag_count_reward/std": 0.26951199769973755, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1216.165283203125, + "completions/mean_terminated_length": 970.9421997070312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26146715678440147, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11737919478591904, + "kl": 0.016510009765625, + "learning_rate": 9.306883823444592e-07, + "loss": 0.076, + "num_tokens": 736516693.0, + "reward": 1.3666294813156128, + "reward_std": 0.3469386398792267, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.2239709198474884, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1147.203125, + "completions/mean_terminated_length": 927.0083618164062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2616802514517074, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10692977032712898, + "kl": 0.017364501953125, + "learning_rate": 9.305097718178999e-07, + "loss": 0.0519, + "num_tokens": 737102672.0, + "reward": 1.4408482313156128, + "reward_std": 0.3874766528606415, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9051339030265808, + "rewards/tag_count_reward/std": 0.23455984890460968, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1013.044677734375, + "completions/mean_terminated_length": 827.8421630859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2618933461190134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12712121772950633, + "kl": 0.020782470703125, + "learning_rate": 9.303309507083074e-07, + "loss": 0.093, + "num_tokens": 737632948.0, + "reward": 1.4531251192092896, + "reward_std": 0.4119478166103363, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8950892686843872, + "rewards/tag_count_reward/std": 0.2374086230993271, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1068.3170166015625, + "completions/mean_terminated_length": 928.3622436523438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26210644078631934, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11498952459920828, + "kl": 0.01800537109375, + "learning_rate": 9.301519191146457e-07, + "loss": 0.0481, + "num_tokens": 738180082.0, + "reward": 1.5513393878936768, + "reward_std": 0.3388901650905609, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.16445478796958923, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1116.1273193359375, + "completions/mean_terminated_length": 897.9201049804688, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.2623195354536253, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.22290054873954854, + "kl": 0.018402099609375, + "learning_rate": 9.299726771359947e-07, + "loss": 0.1197, + "num_tokens": 738749867.0, + "reward": 1.368303656578064, + "reward_std": 0.37348824739456177, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9040178656578064, + "rewards/tag_count_reward/std": 0.23410436511039734, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 977.6250610351562, + "completions/mean_terminated_length": 808.9096069335938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2625326301209312, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12189476131660924, + "kl": 0.02105712890625, + "learning_rate": 9.297932248715515e-07, + "loss": 0.0258, + "num_tokens": 739255379.0, + "reward": 1.5652902126312256, + "reward_std": 0.32277098298072815, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.1828029304742813, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1062.790283203125, + "completions/mean_terminated_length": 815.1116943359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.26274572478823716, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1259967988638335, + "kl": 0.016937255859375, + "learning_rate": 9.29613562420629e-07, + "loss": -0.0015, + "num_tokens": 739803493.0, + "reward": 1.3922991752624512, + "reward_std": 0.279218852519989, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.19506020843982697, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1078.1763916015625, + "completions/mean_terminated_length": 886.2861328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2629588194555431, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12569743507603784, + "kl": 0.017303466796875, + "learning_rate": 9.294336898826566e-07, + "loss": 0.1075, + "num_tokens": 740356548.0, + "reward": 1.399553656578064, + "reward_std": 0.3404173254966736, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9040178656578064, + "rewards/tag_count_reward/std": 0.2329067587852478, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1093.59375, + "completions/mean_terminated_length": 856.986083984375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26317191412284907, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11675731763920931, + "kl": 0.017425537109375, + "learning_rate": 9.2925360735718e-07, + "loss": 0.0371, + "num_tokens": 740916094.0, + "reward": 1.4598214626312256, + "reward_std": 0.3263718783855438, + "rewards/accuracy_reward/mean": 0.5578703880310059, + "rewards/accuracy_reward/std": 0.49721553921699524, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.2142340987920761, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1127.372802734375, + "completions/mean_terminated_length": 886.1943359375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.26338500879015503, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1101613500676768, + "kl": 0.016510009765625, + "learning_rate": 9.290733149438611e-07, + "loss": 0.0569, + "num_tokens": 741499461.0, + "reward": 1.4397321939468384, + "reward_std": 0.3753872811794281, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.20800147950649261, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1105.419677734375, + "completions/mean_terminated_length": 878.2603759765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.263598103457461, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12431351426306203, + "kl": 0.018768310546875, + "learning_rate": 9.288928127424781e-07, + "loss": 0.0796, + "num_tokens": 742063025.0, + "reward": 1.4765626192092896, + "reward_std": 0.37347933650016785, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9029017686843872, + "rewards/tag_count_reward/std": 0.22757954895496368, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1112.8460693359375, + "completions/mean_terminated_length": 881.0111083984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.26381119812476694, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12892978323328838, + "kl": 0.018035888671875, + "learning_rate": 9.28712100852925e-07, + "loss": 0.0685, + "num_tokens": 742628908.0, + "reward": 1.4430804252624512, + "reward_std": 0.33087581396102905, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.22092141211032867, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 950.43310546875, + "completions/mean_terminated_length": 806.30810546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2640242927920729, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14815803607912337, + "kl": 0.019927978515625, + "learning_rate": 9.285311793752119e-07, + "loss": 0.1113, + "num_tokens": 743123982.0, + "reward": 1.4609376192092896, + "reward_std": 0.4009788930416107, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9073660969734192, + "rewards/tag_count_reward/std": 0.21947002410888672, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1087.321533203125, + "completions/mean_terminated_length": 845.81005859375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2642373874593788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1160957619740128, + "kl": 0.017608642578125, + "learning_rate": 9.283500484094652e-07, + "loss": 0.0478, + "num_tokens": 743685038.0, + "reward": 1.442522406578064, + "reward_std": 0.361387699842453, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.21535509824752808, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1077.97998046875, + "completions/mean_terminated_length": 820.4039306640625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26445048212668476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1258940088707082, + "kl": 0.0165252685546875, + "learning_rate": 9.281687080559269e-07, + "loss": 0.1007, + "num_tokens": 744238037.0, + "reward": 1.3794643878936768, + "reward_std": 0.3933660089969635, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9084821343421936, + "rewards/tag_count_reward/std": 0.22805356979370117, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1093.341552734375, + "completions/mean_terminated_length": 846.6320190429688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.2646635767939907, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11364775455527655, + "kl": 0.01904296875, + "learning_rate": 9.27987158414955e-07, + "loss": 0.0435, + "num_tokens": 744795806.0, + "reward": 1.4045759439468384, + "reward_std": 0.27130869030952454, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.2348809540271759, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1062.997802734375, + "completions/mean_terminated_length": 855.3486938476562, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.2648766714612967, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.128938650186391, + "kl": 0.0169677734375, + "learning_rate": 9.278053995870235e-07, + "loss": 0.0871, + "num_tokens": 745346781.0, + "reward": 1.3934152126312256, + "reward_std": 0.3504249155521393, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.19479762017726898, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1062.3929443359375, + "completions/mean_terminated_length": 800.677978515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26508976612860263, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1273156715140467, + "kl": 0.017120361328125, + "learning_rate": 9.276234316727217e-07, + "loss": 0.0741, + "num_tokens": 745889469.0, + "reward": 1.3649554252624512, + "reward_std": 0.3606833815574646, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.22482003271579742, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1043.7366943359375, + "completions/mean_terminated_length": 845.1925048828125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.2653028607959086, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13099118791818068, + "kl": 0.01849365234375, + "learning_rate": 9.274412547727552e-07, + "loss": 0.0856, + "num_tokens": 746437239.0, + "reward": 1.5072544813156128, + "reward_std": 0.32017356157302856, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.21763859689235687, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1022.4732666015625, + "completions/mean_terminated_length": 782.3361206054688, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.26551595546321455, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.41915369366731653, + "kl": 0.021942138671875, + "learning_rate": 9.272588689879447e-07, + "loss": 0.0766, + "num_tokens": 746972219.0, + "reward": 1.364397406578064, + "reward_std": 0.27661991119384766, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.16235284507274628, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1033.7410888671875, + "completions/mean_terminated_length": 836.2986450195312, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.2657290501305205, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12305629804651923, + "kl": 0.019500732421875, + "learning_rate": 9.270762744192271e-07, + "loss": 0.0749, + "num_tokens": 747497351.0, + "reward": 1.5256696939468384, + "reward_std": 0.2930222451686859, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4846842288970947, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.20802249014377594, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1079.3304443359375, + "completions/mean_terminated_length": 893.8403930664062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.2659421447978264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12347060711186815, + "kl": 0.019287109375, + "learning_rate": 9.268934711676543e-07, + "loss": 0.0716, + "num_tokens": 748046443.0, + "reward": 1.3627232313156128, + "reward_std": 0.3684873878955841, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9051339030265808, + "rewards/tag_count_reward/std": 0.2357490509748459, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1047.8348388671875, + "completions/mean_terminated_length": 865.7467651367188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.26615523946513236, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1294320809294041, + "kl": 0.0160675048828125, + "learning_rate": 9.267104593343938e-07, + "loss": 0.1051, + "num_tokens": 748585777.0, + "reward": 1.3264509439468384, + "reward_std": 0.3352195918560028, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.18351179361343384, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1025.5982666015625, + "completions/mean_terminated_length": 867.4948120117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2663683341324383, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.34941743444332113, + "kl": 0.020477294921875, + "learning_rate": 9.265272390207289e-07, + "loss": 0.091, + "num_tokens": 749121021.0, + "reward": 1.4090402126312256, + "reward_std": 0.304977148771286, + "rewards/accuracy_reward/mean": 0.4930555522441864, + "rewards/accuracy_reward/std": 0.5005313754081726, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.19407851994037628, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1050.649658203125, + "completions/mean_terminated_length": 878.3324584960938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2665814287997443, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1414529674591582, + "kl": 0.019805908203125, + "learning_rate": 9.263438103280579e-07, + "loss": 0.1249, + "num_tokens": 749661584.0, + "reward": 1.4810268878936768, + "reward_std": 0.37198108434677124, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.19511540234088898, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 994.32373046875, + "completions/mean_terminated_length": 815.5013427734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26679452346705024, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13613454586536913, + "kl": 0.020843505859375, + "learning_rate": 9.261601733578945e-07, + "loss": 0.1247, + "num_tokens": 750170801.0, + "reward": 1.4810268878936768, + "reward_std": 0.2909072935581207, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.1993686854839325, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 956.24560546875, + "completions/mean_terminated_length": 754.0687866210938, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.2670076181343562, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14141598231871455, + "kl": 0.019561767578125, + "learning_rate": 9.259763282118678e-07, + "loss": 0.1098, + "num_tokens": 750673199.0, + "reward": 1.4681919813156128, + "reward_std": 0.34307199716567993, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.2075621485710144, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1048.075927734375, + "completions/mean_terminated_length": 866.0316772460938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26722071280166215, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13368042300137775, + "kl": 0.0194091796875, + "learning_rate": 9.25792274991722e-07, + "loss": 0.1038, + "num_tokens": 751205697.0, + "reward": 1.575334906578064, + "reward_std": 0.3807145059108734, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9280133843421936, + "rewards/tag_count_reward/std": 0.20061947405338287, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 961.26123046875, + "completions/mean_terminated_length": 753.1622314453125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2674338074689681, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1355603832274258, + "kl": 0.0194091796875, + "learning_rate": 9.256080137993164e-07, + "loss": 0.0567, + "num_tokens": 751703558.0, + "reward": 1.5446429252624512, + "reward_std": 0.29710811376571655, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.1737278550863266, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1038.8460693359375, + "completions/mean_terminated_length": 792.1638793945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26764690213627407, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12704748606158775, + "kl": 0.017120361328125, + "learning_rate": 9.254235447366254e-07, + "loss": 0.0891, + "num_tokens": 752235729.0, + "reward": 1.4330357313156128, + "reward_std": 0.35093873739242554, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.21488575637340546, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 976.8460083007812, + "completions/mean_terminated_length": 854.276123046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26785999680357997, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13369863209014135, + "kl": 0.021636962890625, + "learning_rate": 9.252388679057388e-07, + "loss": 0.0251, + "num_tokens": 752739420.0, + "reward": 1.4804688692092896, + "reward_std": 0.27190059423446655, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.15670275688171387, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1100.94873046875, + "completions/mean_terminated_length": 839.2279052734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2680730914708859, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1425194209591029, + "kl": 0.018280029296875, + "learning_rate": 9.250539834088608e-07, + "loss": 0.1103, + "num_tokens": 753301413.0, + "reward": 1.3359376192092896, + "reward_std": 0.2884243428707123, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9341517686843872, + "rewards/tag_count_reward/std": 0.18353645503520966, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1012.58935546875, + "completions/mean_terminated_length": 790.916015625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.2682861861381919, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12886905502422835, + "kl": 0.0189208984375, + "learning_rate": 9.248688913483109e-07, + "loss": 0.0602, + "num_tokens": 753827917.0, + "reward": 1.4877232313156128, + "reward_std": 0.3074907958507538, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9095982313156128, + "rewards/tag_count_reward/std": 0.23513396084308624, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1021.4085083007812, + "completions/mean_terminated_length": 865.704345703125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.26849928080549784, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11814261975111655, + "kl": 0.0203857421875, + "learning_rate": 9.246835918265235e-07, + "loss": 0.0556, + "num_tokens": 754353108.0, + "reward": 1.4642857313156128, + "reward_std": 0.28475141525268555, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.15519075095653534, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1044.2054443359375, + "completions/mean_terminated_length": 832.5946044921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2687123754728038, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14014850618227453, + "kl": 0.020599365234375, + "learning_rate": 9.244980849460475e-07, + "loss": 0.1386, + "num_tokens": 754891408.0, + "reward": 1.4146206378936768, + "reward_std": 0.3818550705909729, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9079241156578064, + "rewards/tag_count_reward/std": 0.23655983805656433, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1078.78125, + "completions/mean_terminated_length": 800.2701416015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26892547014010976, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14095879539279002, + "kl": 0.016998291015625, + "learning_rate": 9.243123708095469e-07, + "loss": 0.0881, + "num_tokens": 755447294.0, + "reward": 1.3599331378936768, + "reward_std": 0.28407299518585205, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.2073153853416443, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1074.3638916015625, + "completions/mean_terminated_length": 822.75, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2691385648074157, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13144982862168636, + "kl": 0.01800537109375, + "learning_rate": 9.241264495198003e-07, + "loss": 0.0632, + "num_tokens": 756001361.0, + "reward": 1.2723214626312256, + "reward_std": 0.29284587502479553, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211319923401, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.22655968368053436, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1111.5826416015625, + "completions/mean_terminated_length": 879.4345092773438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26935165947472167, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12932675287384024, + "kl": 0.019195556640625, + "learning_rate": 9.239403211797007e-07, + "loss": 0.0883, + "num_tokens": 756566182.0, + "reward": 1.3911831378936768, + "reward_std": 0.4241245985031128, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9135044813156128, + "rewards/tag_count_reward/std": 0.22101959586143494, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 1078.65185546875, + "completions/mean_terminated_length": 871.1219482421875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.2695647541420276, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13351338370829988, + "kl": 0.01739501953125, + "learning_rate": 9.23753985892256e-07, + "loss": 0.1377, + "num_tokens": 757119658.0, + "reward": 1.4068081378936768, + "reward_std": 0.3259485363960266, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.21524494886398315, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 969.27685546875, + "completions/mean_terminated_length": 782.9005126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.26977784880933353, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1445523923289593, + "kl": 0.019287109375, + "learning_rate": 9.235674437605887e-07, + "loss": 0.1158, + "num_tokens": 757624134.0, + "reward": 1.5546876192092896, + "reward_std": 0.3644367754459381, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.18195155262947083, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1012.3772583007812, + "completions/mean_terminated_length": 830.2598266601562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2699909434766395, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11426376389846563, + "kl": 0.019683837890625, + "learning_rate": 9.233806948879354e-07, + "loss": 0.0975, + "num_tokens": 758142479.0, + "reward": 1.4704241752624512, + "reward_std": 0.3003813326358795, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19155997037887573, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1028.984375, + "completions/mean_terminated_length": 859.1484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.27020403814394545, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12201104481683449, + "kl": 0.019073486328125, + "learning_rate": 9.231937393776474e-07, + "loss": 0.1039, + "num_tokens": 758668104.0, + "reward": 1.5295759439468384, + "reward_std": 0.36585596203804016, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.17343208193778992, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1036.5826416015625, + "completions/mean_terminated_length": 846.1034545898438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2704171328112514, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.26527561567332636, + "kl": 0.023193359375, + "learning_rate": 9.2300657733319e-07, + "loss": 0.0225, + "num_tokens": 759193869.0, + "reward": 1.4481027126312256, + "reward_std": 0.3217213749885559, + "rewards/accuracy_reward/mean": 0.5393518805503845, + "rewards/accuracy_reward/std": 0.49902695417404175, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9280133843421936, + "rewards/tag_count_reward/std": 0.1949641853570938, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1002.1183471679688, + "completions/mean_terminated_length": 824.6188354492188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.27063022747855736, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12381739915452857, + "kl": 0.01800537109375, + "learning_rate": 9.228192088581434e-07, + "loss": 0.0545, + "num_tokens": 759722882.0, + "reward": 1.4877232313156128, + "reward_std": 0.359967440366745, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.1804911494255066, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1030.12060546875, + "completions/mean_terminated_length": 784.8143920898438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2708433221458633, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3405075545297052, + "kl": 0.07879638671875, + "learning_rate": 9.226316340562015e-07, + "loss": 0.0541, + "num_tokens": 760257608.0, + "reward": 1.5474331378936768, + "reward_std": 0.37388885021209717, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.19193758070468903, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 997.607177734375, + "completions/mean_terminated_length": 825.7246704101562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2710564168131693, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11600665764022257, + "kl": 0.018524169921875, + "learning_rate": 9.224438530311727e-07, + "loss": 0.0999, + "num_tokens": 760775608.0, + "reward": 1.5044643878936768, + "reward_std": 0.30424150824546814, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.1718057543039322, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1039.1273193359375, + "completions/mean_terminated_length": 809.7123413085938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2712695114804752, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1343517579353277, + "kl": 0.022735595703125, + "learning_rate": 9.222558658869794e-07, + "loss": 0.0575, + "num_tokens": 761313473.0, + "reward": 1.4341518878936768, + "reward_std": 0.33831170201301575, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.20031820237636566, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1163.9420166015625, + "completions/mean_terminated_length": 896.6685791015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.27148260614778114, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10241693365593539, + "kl": 0.018218994140625, + "learning_rate": 9.22067672727658e-07, + "loss": 0.0514, + "num_tokens": 761903671.0, + "reward": 1.4335938692092896, + "reward_std": 0.34236106276512146, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.1882467120885849, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1054.919677734375, + "completions/mean_terminated_length": 892.4155883789062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2716957008150871, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12789871144584183, + "kl": 0.018096923828125, + "learning_rate": 9.218792736573592e-07, + "loss": 0.0549, + "num_tokens": 762454003.0, + "reward": 1.5496652126312256, + "reward_std": 0.3572940528392792, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.18043838441371918, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1038.259033203125, + "completions/mean_terminated_length": 838.4705810546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.27190879548239305, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1346422106752444, + "kl": 0.02032470703125, + "learning_rate": 9.216906687803475e-07, + "loss": 0.1228, + "num_tokens": 762990359.0, + "reward": 1.3476563692092896, + "reward_std": 0.3108026087284088, + "rewards/accuracy_reward/mean": 0.3973214328289032, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.17351123690605164, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1064.58935546875, + "completions/mean_terminated_length": 803.4576416015625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.272121890149699, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13124513376721517, + "kl": 0.018585205078125, + "learning_rate": 9.215018582010008e-07, + "loss": 0.0352, + "num_tokens": 763541215.0, + "reward": 1.3303571939468384, + "reward_std": 0.3550775647163391, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.19285328686237335, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1048.43310546875, + "completions/mean_terminated_length": 824.486328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.27233498481700497, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1229452916585429, + "kl": 0.01983642578125, + "learning_rate": 9.213128420238119e-07, + "loss": 0.0637, + "num_tokens": 764082129.0, + "reward": 1.5044643878936768, + "reward_std": 0.31275925040245056, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.19418221712112427, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1047.3817138671875, + "completions/mean_terminated_length": 833.1572265625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2725480794843109, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15425825221339634, + "kl": 0.019927978515625, + "learning_rate": 9.211236203533864e-07, + "loss": 0.0787, + "num_tokens": 764619340.0, + "reward": 1.446428656578064, + "reward_std": 0.3318825960159302, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.19710472226142883, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1929.0, + "completions/mean_length": 1034.4241943359375, + "completions/mean_terminated_length": 843.5384521484375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2727611741516169, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12728788961953486, + "kl": 0.0191650390625, + "learning_rate": 9.209341932944441e-07, + "loss": 0.1464, + "num_tokens": 765151098.0, + "reward": 1.4441964626312256, + "reward_std": 0.2830570936203003, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.208564892411232, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 1081.7523193359375, + "completions/mean_terminated_length": 881.210205078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2729742688189228, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12452789166596236, + "kl": 0.02044677734375, + "learning_rate": 9.207445609518185e-07, + "loss": 0.1034, + "num_tokens": 765710651.0, + "reward": 1.4693081378936768, + "reward_std": 0.3502883017063141, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.18375654518604279, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1040.3616943359375, + "completions/mean_terminated_length": 840.9893188476562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.27318736348622874, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13767967117707908, + "kl": 0.01983642578125, + "learning_rate": 9.205547234304563e-07, + "loss": 0.1237, + "num_tokens": 766247853.0, + "reward": 1.4977679252624512, + "reward_std": 0.4095991253852844, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9129464030265808, + "rewards/tag_count_reward/std": 0.23101231455802917, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1080.1116943359375, + "completions/mean_terminated_length": 819.6317138671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2734004581535347, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1527045496459597, + "kl": 0.01776123046875, + "learning_rate": 9.203646808354185e-07, + "loss": 0.0951, + "num_tokens": 766814543.0, + "reward": 1.4827009439468384, + "reward_std": 0.37107545137405396, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.23668117821216583, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1110.21435546875, + "completions/mean_terminated_length": 915.5794677734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.27361355282084066, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12547394198569417, + "kl": 0.021728515625, + "learning_rate": 9.201744332718787e-07, + "loss": 0.0363, + "num_tokens": 767378655.0, + "reward": 1.4732143878936768, + "reward_std": 0.32989761233329773, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9040178656578064, + "rewards/tag_count_reward/std": 0.22120662033557892, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1116.5491943359375, + "completions/mean_terminated_length": 875.8370971679688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2738266474881466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1322886581869743, + "kl": 0.017669677734375, + "learning_rate": 9.199839808451244e-07, + "loss": 0.0818, + "num_tokens": 767948501.0, + "reward": 1.4029018878936768, + "reward_std": 0.3874658942222595, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.21812346577644348, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1135.3773193359375, + "completions/mean_terminated_length": 866.338134765625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.27403974215545257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13280638447083, + "kl": 0.0177001953125, + "learning_rate": 9.197933236605568e-07, + "loss": 0.0949, + "num_tokens": 768526990.0, + "reward": 1.4185268878936768, + "reward_std": 0.3754233419895172, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.24003124237060547, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1002.4397583007812, + "completions/mean_terminated_length": 802.2260131835938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.27425283682275853, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14771938410677754, + "kl": 0.019134521484375, + "learning_rate": 9.196024618236898e-07, + "loss": 0.1237, + "num_tokens": 769038755.0, + "reward": 1.5161831378936768, + "reward_std": 0.3286365568637848, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.20777854323387146, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1004.9866333007812, + "completions/mean_terminated_length": 821.569580078125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2744659314900645, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.126777309169689, + "kl": 0.019775390625, + "learning_rate": 9.194113954401507e-07, + "loss": 0.0457, + "num_tokens": 769556573.0, + "reward": 1.4771206378936768, + "reward_std": 0.36081451177597046, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.22157806158065796, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1101.868408203125, + "completions/mean_terminated_length": 886.7205810546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2746790261573704, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13175449661959177, + "kl": 0.019561767578125, + "learning_rate": 9.192201246156804e-07, + "loss": 0.0822, + "num_tokens": 770113426.0, + "reward": 1.3945313692092896, + "reward_std": 0.3666723966598511, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9012276530265808, + "rewards/tag_count_reward/std": 0.236807718873024, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1077.90625, + "completions/mean_terminated_length": 857.3096313476562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.27489212082467634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13093331823566642, + "kl": 0.02020263671875, + "learning_rate": 9.190286494561324e-07, + "loss": 0.065, + "num_tokens": 770665448.0, + "reward": 1.4564732313156128, + "reward_std": 0.3912244737148285, + "rewards/accuracy_reward/mean": 0.5694444179534912, + "rewards/accuracy_reward/std": 0.495728075504303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9073660969734192, + "rewards/tag_count_reward/std": 0.23004567623138428, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1046.16748046875, + "completions/mean_terminated_length": 854.3270874023438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2751052154919823, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13619036697238843, + "kl": 0.02032470703125, + "learning_rate": 9.188369700674735e-07, + "loss": 0.0764, + "num_tokens": 771199907.0, + "reward": 1.5212054252624512, + "reward_std": 0.35409578680992126, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.2190144956111908, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1152.046875, + "completions/mean_terminated_length": 881.1773071289062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.27531831015928826, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1342892072882594, + "kl": 0.017669677734375, + "learning_rate": 9.18645086555784e-07, + "loss": 0.0817, + "num_tokens": 771784248.0, + "reward": 1.4023438692092896, + "reward_std": 0.34993189573287964, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.22639364004135132, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1126.3192138671875, + "completions/mean_terminated_length": 913.6236572265625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.2755314048265942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12335500836179189, + "kl": 0.017242431640625, + "learning_rate": 9.184529990272564e-07, + "loss": 0.0463, + "num_tokens": 772358951.0, + "reward": 1.4380581378936768, + "reward_std": 0.3645114302635193, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.19688211381435394, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1151.0201416015625, + "completions/mean_terminated_length": 848.4567260742188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2757444994939002, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1339283393085002, + "kl": 0.018035888671875, + "learning_rate": 9.182607075881966e-07, + "loss": 0.1464, + "num_tokens": 772947328.0, + "reward": 1.3710938692092896, + "reward_std": 0.3301179111003876, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8978794813156128, + "rewards/tag_count_reward/std": 0.2377442717552185, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1066.6160888671875, + "completions/mean_terminated_length": 850.016357421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.27595759416120613, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11260364875495107, + "kl": 0.018035888671875, + "learning_rate": 9.180682123450229e-07, + "loss": 0.0765, + "num_tokens": 773499780.0, + "reward": 1.3777902126312256, + "reward_std": 0.3194967806339264, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.20309332013130188, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1103.009033203125, + "completions/mean_terminated_length": 852.0791015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2761706888285121, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12013454198141268, + "kl": 0.016998291015625, + "learning_rate": 9.178755134042671e-07, + "loss": 0.0907, + "num_tokens": 774066232.0, + "reward": 1.4285714626312256, + "reward_std": 0.35301855206489563, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.20529504120349884, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1060.26123046875, + "completions/mean_terminated_length": 838.9644775390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.276383783495818, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11987522861710993, + "kl": 0.017822265625, + "learning_rate": 9.176826108725728e-07, + "loss": 0.0608, + "num_tokens": 774608781.0, + "reward": 1.4531251192092896, + "reward_std": 0.32021352648735046, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9151785969734192, + "rewards/tag_count_reward/std": 0.21881206333637238, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1005.0022583007812, + "completions/mean_terminated_length": 834.329833984375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.27659687816312395, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14466275889324853, + "kl": 0.020233154296875, + "learning_rate": 9.174895048566973e-07, + "loss": 0.1145, + "num_tokens": 775126510.0, + "reward": 1.5039063692092896, + "reward_std": 0.2775116264820099, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.18025840818881989, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 893.5067138671875, + "completions/mean_terminated_length": 748.4698486328125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2768099728304299, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1311021620055237, + "kl": 0.0228271484375, + "learning_rate": 9.172961954635099e-07, + "loss": 0.0903, + "num_tokens": 775593969.0, + "reward": 1.4994419813156128, + "reward_std": 0.3223113417625427, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.19004856050014496, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1022.2969360351562, + "completions/mean_terminated_length": 816.0563354492188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.27702306749773586, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13095663745198166, + "kl": 0.017852783203125, + "learning_rate": 9.171026827999922e-07, + "loss": 0.0512, + "num_tokens": 776119414.0, + "reward": 1.4274554252624512, + "reward_std": 0.28089243173599243, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.15616509318351746, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1020.169677734375, + "completions/mean_terminated_length": 728.6074829101562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2772361621650418, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10903291861117369, + "kl": 0.019256591796875, + "learning_rate": 9.16908966973239e-07, + "loss": 0.0765, + "num_tokens": 776649762.0, + "reward": 1.4313616752624512, + "reward_std": 0.3040112853050232, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.1997338831424713, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1112.2879638671875, + "completions/mean_terminated_length": 825.8455200195312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2774492568323478, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12266434006344622, + "kl": 0.0175018310546875, + "learning_rate": 9.167150480904571e-07, + "loss": 0.1108, + "num_tokens": 777220851.0, + "reward": 1.4754464626312256, + "reward_std": 0.3249458372592926, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.18152911961078644, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1051.6875, + "completions/mean_terminated_length": 838.3848266601562, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.27766235149965374, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12180991859009188, + "kl": 0.016845703125, + "learning_rate": 9.165209262589656e-07, + "loss": 0.0644, + "num_tokens": 777767719.0, + "reward": 1.3811384439468384, + "reward_std": 0.3671019971370697, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.1848943680524826, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 975.72998046875, + "completions/mean_terminated_length": 790.4685668945312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.2778754461669597, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13799097346756864, + "kl": 0.02154541015625, + "learning_rate": 9.163266015861963e-07, + "loss": 0.061, + "num_tokens": 778272014.0, + "reward": 1.5267857313156128, + "reward_std": 0.32575759291648865, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.19273674488067627, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1047.388427734375, + "completions/mean_terminated_length": 829.8641357421875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.2780885408342656, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12117315708354938, + "kl": 0.01800537109375, + "learning_rate": 9.16132074179693e-07, + "loss": 0.0829, + "num_tokens": 778813820.0, + "reward": 1.583147406578064, + "reward_std": 0.34535449743270874, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.19325986504554749, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 997.2076416015625, + "completions/mean_terminated_length": 789.2968139648438, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.27830163550157155, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11696184323983488, + "kl": 0.0206298828125, + "learning_rate": 9.159373441471116e-07, + "loss": 0.1028, + "num_tokens": 779331481.0, + "reward": 1.5647321939468384, + "reward_std": 0.3234870433807373, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.17605489492416382, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1053.6273193359375, + "completions/mean_terminated_length": 834.1607666015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2785147301688775, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11333998857801916, + "kl": 0.0174713134765625, + "learning_rate": 9.157424115962202e-07, + "loss": 0.0456, + "num_tokens": 779870594.0, + "reward": 1.4994419813156128, + "reward_std": 0.29483819007873535, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.20140819251537323, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1079.087158203125, + "completions/mean_terminated_length": 814.8380737304688, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.27872782483618347, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1164108918532394, + "kl": 0.017669677734375, + "learning_rate": 9.155472766348993e-07, + "loss": 0.0557, + "num_tokens": 780423801.0, + "reward": 1.4369419813156128, + "reward_std": 0.35016340017318726, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.18019606173038483, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1081.5535888671875, + "completions/mean_terminated_length": 877.8162231445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2789409195034894, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11959882336942596, + "kl": 0.0157318115234375, + "learning_rate": 9.15351939371141e-07, + "loss": 0.1083, + "num_tokens": 780980753.0, + "reward": 1.5412946939468384, + "reward_std": 0.35582858324050903, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.18730680644512177, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1056.868408203125, + "completions/mean_terminated_length": 844.6748046875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.2791540141707954, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.4947473902679029, + "kl": 0.05352783203125, + "learning_rate": 9.151563999130496e-07, + "loss": 0.0708, + "num_tokens": 781523846.0, + "reward": 1.5541294813156128, + "reward_std": 0.340303510427475, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19121423363685608, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1092.430908203125, + "completions/mean_terminated_length": 852.203857421875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.27936710883810134, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12817383387377276, + "kl": 0.0189208984375, + "learning_rate": 9.149606583688413e-07, + "loss": 0.1018, + "num_tokens": 782076535.0, + "reward": 1.4564732313156128, + "reward_std": 0.36992743611335754, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.23239968717098236, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1121.919677734375, + "completions/mean_terminated_length": 882.5955200195312, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.2795802035054073, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11178071436718626, + "kl": 0.0167388916015625, + "learning_rate": 9.147647148468437e-07, + "loss": 0.0398, + "num_tokens": 782646675.0, + "reward": 1.317522406578064, + "reward_std": 0.2902761995792389, + "rewards/accuracy_reward/mean": 0.3616071343421936, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16430194675922394, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 933.4777221679688, + "completions/mean_terminated_length": 764.43701171875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2797932981727132, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1339476684640531, + "kl": 0.021026611328125, + "learning_rate": 9.14568569455497e-07, + "loss": 0.0504, + "num_tokens": 783136857.0, + "reward": 1.4927456378936768, + "reward_std": 0.300567626953125, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.13225844502449036, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1056.1607666015625, + "completions/mean_terminated_length": 869.3687133789062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.28000639284001916, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1283274046980892, + "kl": 0.019195556640625, + "learning_rate": 9.143722223033523e-07, + "loss": 0.0678, + "num_tokens": 783676529.0, + "reward": 1.469866156578064, + "reward_std": 0.3264045715332031, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.19393454492092133, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1124.7254638671875, + "completions/mean_terminated_length": 882.853515625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.2802194875073251, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11825637178520826, + "kl": 0.0162811279296875, + "learning_rate": 9.14175673499073e-07, + "loss": 0.0313, + "num_tokens": 784250262.0, + "reward": 1.3911831378936768, + "reward_std": 0.2963281571865082, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.201184943318367, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 943.0245971679688, + "completions/mean_terminated_length": 765.5414428710938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2804325821746311, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12996510470112538, + "kl": 0.02069091796875, + "learning_rate": 9.139789231514335e-07, + "loss": 0.0444, + "num_tokens": 784740193.0, + "reward": 1.4626116752624512, + "reward_std": 0.3077471852302551, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.18222151696681976, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 948.5938110351562, + "completions/mean_terminated_length": 778.5824584960938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28064567684193703, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11537986731300234, + "kl": 0.019744873046875, + "learning_rate": 9.137819713693204e-07, + "loss": 0.0437, + "num_tokens": 785232331.0, + "reward": 1.5669643878936768, + "reward_std": 0.3538995683193207, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.1838252991437912, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1119.7076416015625, + "completions/mean_terminated_length": 869.8838500976562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.280858771509243, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14324813712547055, + "kl": 0.0158843994140625, + "learning_rate": 9.135848182617314e-07, + "loss": 0.0829, + "num_tokens": 785807720.0, + "reward": 1.3844866752624512, + "reward_std": 0.30960574746131897, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549686908722, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.23282569646835327, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1061.5201416015625, + "completions/mean_terminated_length": 866.334228515625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.28107186617654895, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11645060112077799, + "kl": 0.017913818359375, + "learning_rate": 9.133874639377753e-07, + "loss": 0.0662, + "num_tokens": 786358817.0, + "reward": 1.473772406578064, + "reward_std": 0.2898625433444977, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.1829940527677536, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1057.212158203125, + "completions/mean_terminated_length": 811.5849609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2812849608438549, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12767703266349847, + "kl": 0.01812744140625, + "learning_rate": 9.13189908506673e-07, + "loss": 0.0503, + "num_tokens": 786899984.0, + "reward": 1.3063616752624512, + "reward_std": 0.33003297448158264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.19617067277431488, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1070.49560546875, + "completions/mean_terminated_length": 793.209228515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28149805551116086, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13722848041616018, + "kl": 0.0153656005859375, + "learning_rate": 9.12992152077756e-07, + "loss": 0.0861, + "num_tokens": 787447246.0, + "reward": 1.5094866752624512, + "reward_std": 0.3423069715499878, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.21863441169261932, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1105.0491943359375, + "completions/mean_terminated_length": 903.1707763671875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28171115017846676, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11767120535812368, + "kl": 0.01885986328125, + "learning_rate": 9.127941947604676e-07, + "loss": 0.0646, + "num_tokens": 788009348.0, + "reward": 1.5563616752624512, + "reward_std": 0.3679960370063782, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.18297357857227325, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1181.185302734375, + "completions/mean_terminated_length": 905.8441162109375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2819242448457727, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11667056771683709, + "kl": 0.018402099609375, + "learning_rate": 9.12596036664362e-07, + "loss": 0.1007, + "num_tokens": 788600551.0, + "reward": 1.3309152126312256, + "reward_std": 0.3525812327861786, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.2226571887731552, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1099.384033203125, + "completions/mean_terminated_length": 886.8524169921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2821373395130787, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11243789580256533, + "kl": 0.0184326171875, + "learning_rate": 9.123976778991045e-07, + "loss": 0.0553, + "num_tokens": 789166339.0, + "reward": 1.4453126192092896, + "reward_std": 0.4101276695728302, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.2030172199010849, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1133.732177734375, + "completions/mean_terminated_length": 821.6766967773438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.28235043418038464, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13040762072720444, + "kl": 0.0167694091796875, + "learning_rate": 9.121991185744713e-07, + "loss": 0.1286, + "num_tokens": 789744491.0, + "reward": 1.3755581378936768, + "reward_std": 0.34969362616539, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9157366156578064, + "rewards/tag_count_reward/std": 0.22809666395187378, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1135.482177734375, + "completions/mean_terminated_length": 886.6136474609375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2825635288476906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12475266511159067, + "kl": 0.017303466796875, + "learning_rate": 9.120003588003499e-07, + "loss": 0.0992, + "num_tokens": 790326051.0, + "reward": 1.4079241752624512, + "reward_std": 0.418332040309906, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9213169813156128, + "rewards/tag_count_reward/std": 0.22078222036361694, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1067.5357666015625, + "completions/mean_terminated_length": 837.950439453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28277662351499655, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13121360041275573, + "kl": 0.01800537109375, + "learning_rate": 9.118013986867389e-07, + "loss": 0.0454, + "num_tokens": 790880259.0, + "reward": 1.4335938692092896, + "reward_std": 0.39410674571990967, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.21280090510845184, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1007.6964721679688, + "completions/mean_terminated_length": 837.4649047851562, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2829897181823025, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13209376764465297, + "kl": 0.01959228515625, + "learning_rate": 9.116022383437472e-07, + "loss": 0.0951, + "num_tokens": 791399947.0, + "reward": 1.5345982313156128, + "reward_std": 0.36471685767173767, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.19029554724693298, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1021.5982666015625, + "completions/mean_terminated_length": 831.5238037109375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.28320281284960847, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12199474478311033, + "kl": 0.01953125, + "learning_rate": 9.114028778815947e-07, + "loss": 0.0781, + "num_tokens": 791924935.0, + "reward": 1.5295759439468384, + "reward_std": 0.30135899782180786, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.17181210219860077, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1115.4888916015625, + "completions/mean_terminated_length": 900.2940063476562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.28341590751691437, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1286730240094993, + "kl": 0.01953125, + "learning_rate": 9.112033174106124e-07, + "loss": 0.1351, + "num_tokens": 792492258.0, + "reward": 1.4414063692092896, + "reward_std": 0.41261225938796997, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9056919813156128, + "rewards/tag_count_reward/std": 0.22844666242599487, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 933.3214721679688, + "completions/mean_terminated_length": 767.5487670898438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2836290021842203, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14463917669042317, + "kl": 0.020843505859375, + "learning_rate": 9.110035570412417e-07, + "loss": 0.0711, + "num_tokens": 792985346.0, + "reward": 1.4760044813156128, + "reward_std": 0.3053831160068512, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.15989671647548676, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1035.591552734375, + "completions/mean_terminated_length": 812.1444091796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2838420968515263, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13670847931225974, + "kl": 0.02178955078125, + "learning_rate": 9.108035968840348e-07, + "loss": 0.1155, + "num_tokens": 793516571.0, + "reward": 1.4888393878936768, + "reward_std": 0.3700525462627411, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9040178656578064, + "rewards/tag_count_reward/std": 0.22866584360599518, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1162.2076416015625, + "completions/mean_terminated_length": 891.0466918945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28405519151883224, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12082535966206552, + "kl": 0.0195770263671875, + "learning_rate": 9.10603437049654e-07, + "loss": 0.1001, + "num_tokens": 794106872.0, + "reward": 1.3783482313156128, + "reward_std": 0.3501928150653839, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9185267686843872, + "rewards/tag_count_reward/std": 0.21232296526432037, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1039.462158203125, + "completions/mean_terminated_length": 871.3724365234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2842682861861382, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1430705814576282, + "kl": 0.019134521484375, + "learning_rate": 9.104030776488727e-07, + "loss": 0.1336, + "num_tokens": 794636743.0, + "reward": 1.5133929252624512, + "reward_std": 0.34940335154533386, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.2073281854391098, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1213.9732666015625, + "completions/mean_terminated_length": 932.644775390625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.28448138085344415, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11293167541330806, + "kl": 0.016754150390625, + "learning_rate": 9.102025187925742e-07, + "loss": 0.0494, + "num_tokens": 795251147.0, + "reward": 1.301897406578064, + "reward_std": 0.348479300737381, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.22180895507335663, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1067.22998046875, + "completions/mean_terminated_length": 824.0863647460938, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.2846944755207501, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12420104375672326, + "kl": 0.021087646484375, + "learning_rate": 9.10001760591753e-07, + "loss": 0.0662, + "num_tokens": 795799826.0, + "reward": 1.5239956378936768, + "reward_std": 0.3567754924297333, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.18406878411769867, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1122.009033203125, + "completions/mean_terminated_length": 914.5464477539062, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.28490757018805607, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11326181532397314, + "kl": 0.01788330078125, + "learning_rate": 9.098008031575131e-07, + "loss": 0.0757, + "num_tokens": 796370134.0, + "reward": 1.4095982313156128, + "reward_std": 0.3356040120124817, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.20478054881095886, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1005.8660888671875, + "completions/mean_terminated_length": 765.3736572265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28512066485536197, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13537729933950773, + "kl": 0.021392822265625, + "learning_rate": 9.095996466010689e-07, + "loss": 0.0988, + "num_tokens": 796885002.0, + "reward": 1.5161831378936768, + "reward_std": 0.3343140780925751, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.22180894017219543, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1077.399658203125, + "completions/mean_terminated_length": 863.1798095703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28533375952266793, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1352687549697869, + "kl": 0.0177001953125, + "learning_rate": 9.093982910337454e-07, + "loss": 0.0957, + "num_tokens": 797434525.0, + "reward": 1.4469866752624512, + "reward_std": 0.3665366768836975, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.20797671377658844, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1164.977783203125, + "completions/mean_terminated_length": 920.9515380859375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2855468541899739, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11662790671178405, + "kl": 0.016326904296875, + "learning_rate": 9.091967365669774e-07, + "loss": 0.0321, + "num_tokens": 798024867.0, + "reward": 1.4274554252624512, + "reward_std": 0.3158789873123169, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.17893508076667786, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1116.8773193359375, + "completions/mean_terminated_length": 866.291748046875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.28575994885727984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12843343009270597, + "kl": 0.0170440673828125, + "learning_rate": 9.089949833123098e-07, + "loss": 0.088, + "num_tokens": 798601596.0, + "reward": 1.4291294813156128, + "reward_std": 0.4090938866138458, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.23006939888000488, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1070.0648193359375, + "completions/mean_terminated_length": 854.2261352539062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2859730435245858, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12141747425550446, + "kl": 0.01885986328125, + "learning_rate": 9.087930313813977e-07, + "loss": 0.0601, + "num_tokens": 799156825.0, + "reward": 1.3119419813156128, + "reward_std": 0.2898409068584442, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.19648860394954681, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 961.1473388671875, + "completions/mean_terminated_length": 789.8346557617188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.28618613819189176, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12548457007195213, + "kl": 0.019561767578125, + "learning_rate": 9.08590880886006e-07, + "loss": 0.0561, + "num_tokens": 799657051.0, + "reward": 1.5351563692092896, + "reward_std": 0.35372021794319153, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9213169813156128, + "rewards/tag_count_reward/std": 0.20974001288414001, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1017.7098388671875, + "completions/mean_terminated_length": 870.5255126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2863992328591977, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1262389678285119, + "kl": 0.0179443359375, + "learning_rate": 9.083885319380095e-07, + "loss": 0.059, + "num_tokens": 800182633.0, + "reward": 1.5106027126312256, + "reward_std": 0.33969682455062866, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9280133843421936, + "rewards/tag_count_reward/std": 0.19061264395713806, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1118.703125, + "completions/mean_terminated_length": 897.9309692382812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2866123275265037, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12448776595543022, + "kl": 0.017852783203125, + "learning_rate": 9.08185984649393e-07, + "loss": 0.115, + "num_tokens": 800755572.0, + "reward": 1.4335938692092896, + "reward_std": 0.37306830286979675, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.2255041003227234, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1066.2366943359375, + "completions/mean_terminated_length": 842.986328125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.2868254221938096, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12429465381767626, + "kl": 0.017669677734375, + "learning_rate": 9.079832391322506e-07, + "loss": 0.0796, + "num_tokens": 801302222.0, + "reward": 1.4051339626312256, + "reward_std": 0.3277166485786438, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9207589030265808, + "rewards/tag_count_reward/std": 0.2131679654121399, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 905.7835083007812, + "completions/mean_terminated_length": 745.9312744140625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.28703851686111553, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1236832228664559, + "kl": 0.019989013671875, + "learning_rate": 9.077802954987868e-07, + "loss": 0.005, + "num_tokens": 801775261.0, + "reward": 1.5407366752624512, + "reward_std": 0.2504132390022278, + "rewards/accuracy_reward/mean": 0.6466346383094788, + "rewards/accuracy_reward/std": 0.4785905182361603, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.17596179246902466, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1054.0045166015625, + "completions/mean_terminated_length": 873.0396118164062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2872516115284215, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12909507788107594, + "kl": 0.019622802734375, + "learning_rate": 9.07577153861315e-07, + "loss": 0.1357, + "num_tokens": 802318431.0, + "reward": 1.5161831378936768, + "reward_std": 0.3458826243877411, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8889508843421936, + "rewards/tag_count_reward/std": 0.25267162919044495, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 916.2254638671875, + "completions/mean_terminated_length": 777.235595703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28746470619572745, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14159079616731926, + "kl": 0.020599365234375, + "learning_rate": 9.073738143322589e-07, + "loss": 0.0638, + "num_tokens": 802790516.0, + "reward": 1.5887277126312256, + "reward_std": 0.3157961368560791, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.18257060647010803, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 943.5245971679688, + "completions/mean_terminated_length": 807.88720703125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2876778008630334, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14104431899326125, + "kl": 0.022491455078125, + "learning_rate": 9.071702770241512e-07, + "loss": 0.0503, + "num_tokens": 803284319.0, + "reward": 1.5664063692092896, + "reward_std": 0.2470361292362213, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.17317995429039001, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1070.6920166015625, + "completions/mean_terminated_length": 854.9918212890625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.28789089553033936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13545160650419127, + "kl": 0.018280029296875, + "learning_rate": 9.069665420496341e-07, + "loss": 0.0867, + "num_tokens": 803831637.0, + "reward": 1.4587054252624512, + "reward_std": 0.3756580948829651, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9207589030265808, + "rewards/tag_count_reward/std": 0.20986275374889374, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 954.6629638671875, + "completions/mean_terminated_length": 772.4401245117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2881039901976453, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13007115145869658, + "kl": 0.0203857421875, + "learning_rate": 9.067626095214596e-07, + "loss": 0.0847, + "num_tokens": 804320430.0, + "reward": 1.5535714626312256, + "reward_std": 0.334953248500824, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.18918029963970184, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1087.0848388671875, + "completions/mean_terminated_length": 858.8011474609375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2883170848649513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12294409359863658, + "kl": 0.0164794921875, + "learning_rate": 9.065584795524884e-07, + "loss": 0.1207, + "num_tokens": 804877348.0, + "reward": 1.4704241752624512, + "reward_std": 0.37603843212127686, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.1959095597267151, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1085.5223388671875, + "completions/mean_terminated_length": 888.8870849609375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2885301795322572, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12061608385179286, + "kl": 0.0160369873046875, + "learning_rate": 9.06354152255691e-07, + "loss": 0.0561, + "num_tokens": 805430622.0, + "reward": 1.3744419813156128, + "reward_std": 0.3371695280075073, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960493743419647, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1922689974308014, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 995.87060546875, + "completions/mean_terminated_length": 823.703857421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28874327419956314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13305217535225422, + "kl": 0.023223876953125, + "learning_rate": 9.06149627744147e-07, + "loss": 0.0669, + "num_tokens": 805941076.0, + "reward": 1.5892857313156128, + "reward_std": 0.34248897433280945, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.16685131192207336, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1125.46875, + "completions/mean_terminated_length": 887.0618286132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2889563688668691, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13760718076730524, + "kl": 0.01983642578125, + "learning_rate": 9.059449061310451e-07, + "loss": 0.0732, + "num_tokens": 806506358.0, + "reward": 1.25, + "reward_std": 0.31314703822135925, + "rewards/accuracy_reward/mean": 0.3236607015132904, + "rewards/accuracy_reward/std": 0.46839529275894165, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.20789341628551483, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1028.03125, + "completions/mean_terminated_length": 809.6640014648438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28916946353417505, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11335865602688269, + "kl": 0.0175323486328125, + "learning_rate": 9.057399875296827e-07, + "loss": 0.037, + "num_tokens": 807038260.0, + "reward": 1.3035714626312256, + "reward_std": 0.3286227583885193, + "rewards/accuracy_reward/mean": 0.3616071343421936, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.19061346352100372, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1048.4888916015625, + "completions/mean_terminated_length": 872.7218017578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.289382558201481, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12460983050952815, + "kl": 0.0170440673828125, + "learning_rate": 9.055348720534668e-07, + "loss": 0.0322, + "num_tokens": 807577823.0, + "reward": 1.3984376192092896, + "reward_std": 0.31667280197143555, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.16746000945568085, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1135.9442138671875, + "completions/mean_terminated_length": 839.121337890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.28959565286878697, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11633832651189803, + "kl": 0.01690673828125, + "learning_rate": 9.053295598159133e-07, + "loss": 0.0759, + "num_tokens": 808158118.0, + "reward": 1.3203125, + "reward_std": 0.3191813826560974, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.2081664800643921, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1106.80810546875, + "completions/mean_terminated_length": 846.70654296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2898087475360929, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12601665168675036, + "kl": 0.0165863037109375, + "learning_rate": 9.051240509306463e-07, + "loss": 0.0793, + "num_tokens": 808725488.0, + "reward": 1.4017857313156128, + "reward_std": 0.3301330804824829, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.1964321881532669, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1010.7053833007812, + "completions/mean_terminated_length": 778.3059692382812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.2900218422033989, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11951829546230916, + "kl": 0.017578125, + "learning_rate": 9.049183455113998e-07, + "loss": 0.0637, + "num_tokens": 809252044.0, + "reward": 1.32421875, + "reward_std": 0.26720666885375977, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.1865345686674118, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 959.8281860351562, + "completions/mean_terminated_length": 816.9368896484375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.2902349368707048, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1410473298302068, + "kl": 0.0213623046875, + "learning_rate": 9.047124436720155e-07, + "loss": 0.0925, + "num_tokens": 809752079.0, + "reward": 1.5686384439468384, + "reward_std": 0.29772481322288513, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.162506565451622, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1069.3817138671875, + "completions/mean_terminated_length": 846.8466186523438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29044803153801074, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12894771585055406, + "kl": 0.019378662109375, + "learning_rate": 9.045063455264447e-07, + "loss": 0.092, + "num_tokens": 810298330.0, + "reward": 1.4051339626312256, + "reward_std": 0.3721138536930084, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.21542829275131226, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1070.0648193359375, + "completions/mean_terminated_length": 827.6239624023438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.2906611262053167, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11979578056277823, + "kl": 0.018280029296875, + "learning_rate": 9.043000511887467e-07, + "loss": 0.0496, + "num_tokens": 810846599.0, + "reward": 1.4352679252624512, + "reward_std": 0.3696146309375763, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9196428656578064, + "rewards/tag_count_reward/std": 0.21985933184623718, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1000.7835083007812, + "completions/mean_terminated_length": 780.0189208984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.29087422087262266, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16189542852984404, + "kl": 0.018585205078125, + "learning_rate": 9.040935607730899e-07, + "loss": 0.0766, + "num_tokens": 811366694.0, + "reward": 1.4553571939468384, + "reward_std": 0.31917306780815125, + "rewards/accuracy_reward/mean": 0.5162037014961243, + "rewards/accuracy_reward/std": 0.5003167986869812, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.15736359357833862, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1137.109375, + "completions/mean_terminated_length": 904.9215698242188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2910873155399286, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10998254603439232, + "kl": 0.0157012939453125, + "learning_rate": 9.038868743937505e-07, + "loss": 0.0649, + "num_tokens": 811941687.0, + "reward": 1.477678656578064, + "reward_std": 0.32315024733543396, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.17503081262111664, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1096.493408203125, + "completions/mean_terminated_length": 892.783203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2913004102072346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12337981894192616, + "kl": 0.018585205078125, + "learning_rate": 9.036799921651141e-07, + "loss": 0.1296, + "num_tokens": 812502964.0, + "reward": 1.3649554252624512, + "reward_std": 0.36768028140068054, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.20240136981010437, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1118.01123046875, + "completions/mean_terminated_length": 864.3778686523438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29151350487454053, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12029299841815014, + "kl": 0.017425537109375, + "learning_rate": 9.034729142016739e-07, + "loss": 0.1162, + "num_tokens": 813074409.0, + "reward": 1.5000001192092896, + "reward_std": 0.4060979187488556, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.18240727484226227, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 902.825927734375, + "completions/mean_terminated_length": 742.559814453125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.2917265995418465, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13859788910943882, + "kl": 0.02142333984375, + "learning_rate": 9.032656406180317e-07, + "loss": 0.0852, + "num_tokens": 813536459.0, + "reward": 1.6439732313156128, + "reward_std": 0.34982573986053467, + "rewards/accuracy_reward/mean": 0.7053571343421936, + "rewards/accuracy_reward/std": 0.45639166235923767, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.19608551263809204, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 887.6563110351562, + "completions/mean_terminated_length": 738.5944213867188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2919396942091524, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14003256928640812, + "kl": 0.0211181640625, + "learning_rate": 9.030581715288976e-07, + "loss": 0.1178, + "num_tokens": 813998193.0, + "reward": 1.5027902126312256, + "reward_std": 0.3440830409526825, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.20504480600357056, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1045.12060546875, + "completions/mean_terminated_length": 840.231201171875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.29215278887645835, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12547713190171392, + "kl": 0.017303466796875, + "learning_rate": 9.028505070490898e-07, + "loss": 0.0751, + "num_tokens": 814530551.0, + "reward": 1.505022406578064, + "reward_std": 0.3037368059158325, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.16576191782951355, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1049.3817138671875, + "completions/mean_terminated_length": 848.587158203125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.2923658835437643, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13948451405051082, + "kl": 0.020416259765625, + "learning_rate": 9.026426472935348e-07, + "loss": 0.1093, + "num_tokens": 815063842.0, + "reward": 1.5334821939468384, + "reward_std": 0.35994020104408264, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.2065439224243164, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1155.779052734375, + "completions/mean_terminated_length": 875.8152465820312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29257897821107026, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12391504834502774, + "kl": 0.0147705078125, + "learning_rate": 9.024345923772671e-07, + "loss": 0.1135, + "num_tokens": 815660911.0, + "reward": 1.2840402126312256, + "reward_std": 0.36466774344444275, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.23758143186569214, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1208.359375, + "completions/mean_terminated_length": 883.4210205078125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2927920728783762, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11496203230591968, + "kl": 0.0152435302734375, + "learning_rate": 9.02226342415429e-07, + "loss": 0.0834, + "num_tokens": 816283776.0, + "reward": 1.3515626192092896, + "reward_std": 0.38633763790130615, + "rewards/accuracy_reward/mean": 0.46759259700775146, + "rewards/accuracy_reward/std": 0.49952712655067444, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9006696343421936, + "rewards/tag_count_reward/std": 0.24499371647834778, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 922.7210083007812, + "completions/mean_terminated_length": 752.048828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2930051675456822, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13564816184001682, + "kl": 0.021087646484375, + "learning_rate": 9.020178975232709e-07, + "loss": 0.1157, + "num_tokens": 816761603.0, + "reward": 1.4810268878936768, + "reward_std": 0.3346594572067261, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.19318638741970062, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1024.165283203125, + "completions/mean_terminated_length": 794.7813720703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29321826221298813, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16033672967602017, + "kl": 0.02008056640625, + "learning_rate": 9.018092578161514e-07, + "loss": 0.1145, + "num_tokens": 817291741.0, + "reward": 1.4196429252624512, + "reward_std": 0.3855610191822052, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.211996391415596, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1182.884033203125, + "completions/mean_terminated_length": 911.4252319335938, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.2934313568802941, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11748575013782174, + "kl": 0.0154876708984375, + "learning_rate": 9.016004234095362e-07, + "loss": 0.0866, + "num_tokens": 817898969.0, + "reward": 1.3543527126312256, + "reward_std": 0.39249926805496216, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9056919813156128, + "rewards/tag_count_reward/std": 0.23329170048236847, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1018.12060546875, + "completions/mean_terminated_length": 827.402099609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2936444515476, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12969368088868707, + "kl": 0.0202178955078125, + "learning_rate": 9.013913944189994e-07, + "loss": 0.1255, + "num_tokens": 818425823.0, + "reward": 1.5117188692092896, + "reward_std": 0.31404152512550354, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.1926516890525818, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 942.6808471679688, + "completions/mean_terminated_length": 744.8869018554688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29385754621490595, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12554022741685236, + "kl": 0.017333984375, + "learning_rate": 9.011821709602227e-07, + "loss": 0.1146, + "num_tokens": 818912208.0, + "reward": 1.5513393878936768, + "reward_std": 0.3249755799770355, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.19118894636631012, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1089.977783203125, + "completions/mean_terminated_length": 835.5875854492188, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.2940706408822119, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11654853424961242, + "kl": 0.0165252685546875, + "learning_rate": 9.009727531489949e-07, + "loss": 0.0914, + "num_tokens": 819476022.0, + "reward": 1.4866071939468384, + "reward_std": 0.39225301146507263, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.20332752168178558, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 881.82373046875, + "completions/mean_terminated_length": 728.6893920898438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29428373554951787, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14870577540017033, + "kl": 0.0216064453125, + "learning_rate": 9.007631411012129e-07, + "loss": 0.0785, + "num_tokens": 819935543.0, + "reward": 1.5290179252624512, + "reward_std": 0.3149246871471405, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.1934608370065689, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1035.859375, + "completions/mean_terminated_length": 815.8287963867188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2944968302168238, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1279863416971706, + "kl": 0.0184326171875, + "learning_rate": 9.00553334932881e-07, + "loss": 0.1036, + "num_tokens": 820472856.0, + "reward": 1.4447544813156128, + "reward_std": 0.34381943941116333, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.21214282512664795, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1069.825927734375, + "completions/mean_terminated_length": 863.6162109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2947099248841298, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12909815590518994, + "kl": 0.0171661376953125, + "learning_rate": 9.003433347601108e-07, + "loss": 0.1012, + "num_tokens": 821022554.0, + "reward": 1.3755581378936768, + "reward_std": 0.3863195776939392, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9246651530265808, + "rewards/tag_count_reward/std": 0.22320599853992462, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1051.138427734375, + "completions/mean_terminated_length": 775.6524047851562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.29492301955143574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33523925290709916, + "kl": 0.01800537109375, + "learning_rate": 9.001331406991212e-07, + "loss": 0.092, + "num_tokens": 821567704.0, + "reward": 1.3169643878936768, + "reward_std": 0.32823944091796875, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.22093555331230164, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 992.8214721679688, + "completions/mean_terminated_length": 770.37841796875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.2951361142187417, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14786096024583684, + "kl": 0.021942138671875, + "learning_rate": 8.999227528662388e-07, + "loss": 0.0938, + "num_tokens": 822087192.0, + "reward": 1.5351563692092896, + "reward_std": 0.31625211238861084, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.17932796478271484, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1051.263427734375, + "completions/mean_terminated_length": 857.2319946289062, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.29534920888604765, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11995310838117135, + "kl": 0.01861572265625, + "learning_rate": 8.997121713778968e-07, + "loss": 0.0878, + "num_tokens": 822625646.0, + "reward": 1.4575893878936768, + "reward_std": 0.29564735293388367, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.18991795182228088, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1031.4129638671875, + "completions/mean_terminated_length": 807.0435791015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29556230355335356, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12389574815147643, + "kl": 0.01812744140625, + "learning_rate": 8.995013963506362e-07, + "loss": 0.1128, + "num_tokens": 823161543.0, + "reward": 1.536272406578064, + "reward_std": 0.3453434109687805, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.19750888645648956, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1004.1406860351562, + "completions/mean_terminated_length": 814.09765625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.2957753982206595, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 1.1377261640978062, + "kl": 0.08575439453125, + "learning_rate": 8.992904279011048e-07, + "loss": 0.1233, + "num_tokens": 823686262.0, + "reward": 1.5055804252624512, + "reward_std": 0.3839109539985657, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9073660969734192, + "rewards/tag_count_reward/std": 0.23246414959430695, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 982.5379638671875, + "completions/mean_terminated_length": 848.6859130859375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.29598849288796547, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1296868288252111, + "kl": 0.01849365234375, + "learning_rate": 8.990792661460575e-07, + "loss": 0.1026, + "num_tokens": 824187191.0, + "reward": 1.6205357313156128, + "reward_std": 0.3464025855064392, + "rewards/accuracy_reward/mean": 0.6785714030265808, + "rewards/accuracy_reward/std": 0.4675469696521759, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.16965599358081818, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1049.513427734375, + "completions/mean_terminated_length": 822.4602661132812, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.29620158755527143, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12193831873142236, + "kl": 0.016998291015625, + "learning_rate": 8.98867911202356e-07, + "loss": 0.074, + "num_tokens": 824729789.0, + "reward": 1.3967634439468384, + "reward_std": 0.34617260098457336, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.2049778252840042, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 993.6027221679688, + "completions/mean_terminated_length": 833.6812133789062, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.2964146822225774, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13133356104270477, + "kl": 0.01776123046875, + "learning_rate": 8.986563631869693e-07, + "loss": 0.0919, + "num_tokens": 825244571.0, + "reward": 1.4469866752624512, + "reward_std": 0.36387813091278076, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19768576323986053, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1032.3192138671875, + "completions/mean_terminated_length": 801.356201171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29662777688988334, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13786845038553006, + "kl": 0.0186767578125, + "learning_rate": 8.984446222169729e-07, + "loss": 0.1207, + "num_tokens": 825776954.0, + "reward": 1.3431919813156128, + "reward_std": 0.3439805209636688, + "rewards/accuracy_reward/mean": 0.4236111044883728, + "rewards/accuracy_reward/std": 0.4947032034397125, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19445767998695374, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1010.3303833007812, + "completions/mean_terminated_length": 788.1734619140625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.2968408715571893, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11964439136483068, + "kl": 0.017242431640625, + "learning_rate": 8.982326884095492e-07, + "loss": 0.0723, + "num_tokens": 826298686.0, + "reward": 1.4732143878936768, + "reward_std": 0.29365402460098267, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.19774970412254333, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1014.1272583007812, + "completions/mean_terminated_length": 806.2440185546875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.29705396622449526, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13063972878830846, + "kl": 0.0179443359375, + "learning_rate": 8.980205618819877e-07, + "loss": 0.0632, + "num_tokens": 826820199.0, + "reward": 1.4916294813156128, + "reward_std": 0.34952548146247864, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.15956845879554749, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1100.560302734375, + "completions/mean_terminated_length": 821.2572021484375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.29726706089180116, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1983475992856638, + "kl": 0.019012451171875, + "learning_rate": 8.978082427516837e-07, + "loss": 0.1442, + "num_tokens": 827387810.0, + "reward": 1.4536831378936768, + "reward_std": 0.381161093711853, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.22741149365901947, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1134.140625, + "completions/mean_terminated_length": 884.90625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2974801555591071, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11928166269437718, + "kl": 0.0169830322265625, + "learning_rate": 8.975957311361398e-07, + "loss": 0.0339, + "num_tokens": 827967185.0, + "reward": 1.3571429252624512, + "reward_std": 0.36671510338783264, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1724584847688675, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 988.3772583007812, + "completions/mean_terminated_length": 758.0244750976562, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.2976932502264131, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12996950343475697, + "kl": 0.0191650390625, + "learning_rate": 8.973830271529649e-07, + "loss": 0.1172, + "num_tokens": 828478618.0, + "reward": 1.4899554252624512, + "reward_std": 0.28695932030677795, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.16277234256267548, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 990.5692138671875, + "completions/mean_terminated_length": 807.8717651367188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29790634489371903, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11925011063101904, + "kl": 0.0163421630859375, + "learning_rate": 8.971701309198742e-07, + "loss": 0.0554, + "num_tokens": 828993209.0, + "reward": 1.5580357313156128, + "reward_std": 0.3501366376876831, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15220165252685547, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1007.450927734375, + "completions/mean_terminated_length": 827.670166015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.298119439561025, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13017080383554278, + "kl": 0.019744873046875, + "learning_rate": 8.9695704255469e-07, + "loss": 0.1083, + "num_tokens": 829518163.0, + "reward": 1.5178571939468384, + "reward_std": 0.37711983919143677, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.1928403526544571, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1030.9442138671875, + "completions/mean_terminated_length": 771.6947021484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29833253422833095, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16500231690753925, + "kl": 0.0185546875, + "learning_rate": 8.967437621753398e-07, + "loss": 0.1594, + "num_tokens": 830046042.0, + "reward": 1.4369419813156128, + "reward_std": 0.35246893763542175, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19662198424339294, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 875.685302734375, + "completions/mean_terminated_length": 748.0073852539062, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.2985456288956369, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13329389179295378, + "kl": 0.020660400390625, + "learning_rate": 8.965302898998581e-07, + "loss": 0.0713, + "num_tokens": 830506173.0, + "reward": 1.5429688692092896, + "reward_std": 0.3309931755065918, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.17006663978099823, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 957.9777221679688, + "completions/mean_terminated_length": 789.41748046875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.29875872356294286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14822042720898254, + "kl": 0.019866943359375, + "learning_rate": 8.963166258463859e-07, + "loss": 0.0955, + "num_tokens": 831006099.0, + "reward": 1.5239956378936768, + "reward_std": 0.3759874403476715, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.17854657769203186, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 969.15185546875, + "completions/mean_terminated_length": 772.7388305664062, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.29897181823024876, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.17107355009678538, + "kl": 0.02093505859375, + "learning_rate": 8.961027701331693e-07, + "loss": 0.1572, + "num_tokens": 831504551.0, + "reward": 1.3850446939468384, + "reward_std": 0.2944815158843994, + "rewards/accuracy_reward/mean": 0.46990740299224854, + "rewards/accuracy_reward/std": 0.4996722638607025, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.19884200394153595, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 999.8348388671875, + "completions/mean_terminated_length": 802.4349975585938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2991849128975547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38181681629980196, + "kl": 0.022247314453125, + "learning_rate": 8.958887228785615e-07, + "loss": 0.0852, + "num_tokens": 832024333.0, + "reward": 1.3325893878936768, + "reward_std": 0.387983500957489, + "rewards/accuracy_reward/mean": 0.42824074625968933, + "rewards/accuracy_reward/std": 0.4953974783420563, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9196428656578064, + "rewards/tag_count_reward/std": 0.2067493349313736, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1125.2366943359375, + "completions/mean_terminated_length": 856.6512451171875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2993980075648607, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11975598424664902, + "kl": 0.016448974609375, + "learning_rate": 8.95674484201021e-07, + "loss": 0.1086, + "num_tokens": 832595943.0, + "reward": 1.3883929252624512, + "reward_std": 0.3373308479785919, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.2012539505958557, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 945.3638916015625, + "completions/mean_terminated_length": 778.1259765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.29961110223216664, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13186394636310558, + "kl": 0.0201416015625, + "learning_rate": 8.954600542191128e-07, + "loss": 0.0659, + "num_tokens": 833087578.0, + "reward": 1.5033482313156128, + "reward_std": 0.36830461025238037, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.1880783587694168, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 976.5469360351562, + "completions/mean_terminated_length": 736.4945068359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2998241968994726, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12730494152164304, + "kl": 0.018829345703125, + "learning_rate": 8.952454330515072e-07, + "loss": 0.0926, + "num_tokens": 833593775.0, + "reward": 1.4575893878936768, + "reward_std": 0.33134713768959045, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.1884397715330124, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1073.37060546875, + "completions/mean_terminated_length": 874.252685546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.30003729156677855, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12345956694277439, + "kl": 0.01751708984375, + "learning_rate": 8.950306208169805e-07, + "loss": 0.0634, + "num_tokens": 834140805.0, + "reward": 1.3616071939468384, + "reward_std": 0.35382047295570374, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.18987849354743958, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1102.6741943359375, + "completions/mean_terminated_length": 881.3168334960938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3002503862340845, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11783133569107644, + "kl": 0.0163421630859375, + "learning_rate": 8.948156176344154e-07, + "loss": 0.0644, + "num_tokens": 834703107.0, + "reward": 1.3426339626312256, + "reward_std": 0.37391969561576843, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.20146164298057556, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1113.5513916015625, + "completions/mean_terminated_length": 809.4408569335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.30046348090139047, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11731171392788947, + "kl": 0.0167999267578125, + "learning_rate": 8.94600423622799e-07, + "loss": 0.0591, + "num_tokens": 835272682.0, + "reward": 1.3945313692092896, + "reward_std": 0.39762619137763977, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9168526530265808, + "rewards/tag_count_reward/std": 0.221673846244812, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 1082.513427734375, + "completions/mean_terminated_length": 878.9783935546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.30067657556869637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10843301400402108, + "kl": 0.017425537109375, + "learning_rate": 8.943850389012252e-07, + "loss": 0.0474, + "num_tokens": 835822624.0, + "reward": 1.489397406578064, + "reward_std": 0.36612966656684875, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.18964743614196777, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1047.0513916015625, + "completions/mean_terminated_length": 855.3803100585938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3008896702360023, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12435930285702078, + "kl": 0.018768310546875, + "learning_rate": 8.941694635888928e-07, + "loss": 0.1483, + "num_tokens": 836357031.0, + "reward": 1.399553656578064, + "reward_std": 0.3439539968967438, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9196428656578064, + "rewards/tag_count_reward/std": 0.21858371794223785, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1026.9554443359375, + "completions/mean_terminated_length": 824.9304809570312, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.3011027649033083, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.127661821463488, + "kl": 0.01885986328125, + "learning_rate": 8.939536978051062e-07, + "loss": 0.0736, + "num_tokens": 836896739.0, + "reward": 1.5228794813156128, + "reward_std": 0.3001701533794403, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.17181210219860077, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1002.4710083007812, + "completions/mean_terminated_length": 825.0313720703125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.30131585957061424, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13035861436231824, + "kl": 0.0194091796875, + "learning_rate": 8.937377416692752e-07, + "loss": 0.0618, + "num_tokens": 837417414.0, + "reward": 1.5223214626312256, + "reward_std": 0.27997490763664246, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.17525888979434967, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1007.7388916015625, + "completions/mean_terminated_length": 824.8057861328125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.3015289542379202, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13182614690024136, + "kl": 0.02020263671875, + "learning_rate": 8.935215953009151e-07, + "loss": 0.085, + "num_tokens": 837931617.0, + "reward": 1.4313616752624512, + "reward_std": 0.2713320255279541, + "rewards/accuracy_reward/mean": 0.5231481194496155, + "rewards/accuracy_reward/std": 0.5000429749488831, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.19669181108474731, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1089.71435546875, + "completions/mean_terminated_length": 817.8796997070312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.30174204890522616, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14171619584956388, + "kl": 0.019561767578125, + "learning_rate": 8.933052588196464e-07, + "loss": 0.0978, + "num_tokens": 838497617.0, + "reward": 1.2600446939468384, + "reward_std": 0.3605620563030243, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9207589030265808, + "rewards/tag_count_reward/std": 0.21962924301624298, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 986.1897583007812, + "completions/mean_terminated_length": 786.2201538085938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3019551435725321, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11813334522327122, + "kl": 0.020294189453125, + "learning_rate": 8.930887323451947e-07, + "loss": 0.0675, + "num_tokens": 839009142.0, + "reward": 1.4302456378936768, + "reward_std": 0.40599361062049866, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9168526530265808, + "rewards/tag_count_reward/std": 0.22104217112064362, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1133.3326416015625, + "completions/mean_terminated_length": 883.8778686523438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.30216823823983807, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13537016539897972, + "kl": 0.017669677734375, + "learning_rate": 8.928720159973908e-07, + "loss": 0.1098, + "num_tokens": 839592987.0, + "reward": 1.3348214626312256, + "reward_std": 0.38245120644569397, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9017857313156128, + "rewards/tag_count_reward/std": 0.23909583687782288, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1329.9866943359375, + "completions/mean_terminated_length": 996.7908325195312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.302381332907144, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11230467899749877, + "kl": 0.013580322265625, + "learning_rate": 8.926551098961708e-07, + "loss": 0.0544, + "num_tokens": 840263525.0, + "reward": 1.2991071939468384, + "reward_std": 0.38015004992485046, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.21875499188899994, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1071.84375, + "completions/mean_terminated_length": 802.0797729492188, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.30259442757444993, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14000220224572366, + "kl": 0.02001953125, + "learning_rate": 8.924380141615753e-07, + "loss": 0.0884, + "num_tokens": 840819663.0, + "reward": 1.2310268878936768, + "reward_std": 0.3986087143421173, + "rewards/accuracy_reward/mean": 0.35879629850387573, + "rewards/accuracy_reward/std": 0.48020341992378235, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8850446343421936, + "rewards/tag_count_reward/std": 0.24839438498020172, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1129.169677734375, + "completions/mean_terminated_length": 871.8971557617188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3028075222417559, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12882822493613516, + "kl": 0.0179443359375, + "learning_rate": 8.922207289137504e-07, + "loss": 0.0871, + "num_tokens": 841398331.0, + "reward": 1.3470982313156128, + "reward_std": 0.3199007511138916, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9073660969734192, + "rewards/tag_count_reward/std": 0.2269863337278366, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1031.1473388671875, + "completions/mean_terminated_length": 833.2000122070312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.30302061690906185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13745037336854254, + "kl": 0.01898193359375, + "learning_rate": 8.920032542729468e-07, + "loss": 0.1115, + "num_tokens": 841929501.0, + "reward": 1.5027902126312256, + "reward_std": 0.4292661249637604, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8956473469734192, + "rewards/tag_count_reward/std": 0.23853585124015808, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 981.5625610351562, + "completions/mean_terminated_length": 826.09716796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3032337115763678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15454235007165662, + "kl": 0.020843505859375, + "learning_rate": 8.917855903595202e-07, + "loss": 0.09, + "num_tokens": 842446201.0, + "reward": 1.3376116752624512, + "reward_std": 0.3645228445529938, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.21985293924808502, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1202.171875, + "completions/mean_terminated_length": 906.6415405273438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.30344680624367376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39169152412779784, + "kl": 0.026580810546875, + "learning_rate": 8.915677372939306e-07, + "loss": 0.054, + "num_tokens": 843066710.0, + "reward": 1.3247768878936768, + "reward_std": 0.3521655201911926, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9095982313156128, + "rewards/tag_count_reward/std": 0.23394164443016052, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 997.5670166015625, + "completions/mean_terminated_length": 856.622802734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3036599009109797, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12774237753487433, + "kl": 0.020965576171875, + "learning_rate": 8.913496951967434e-07, + "loss": 0.0988, + "num_tokens": 843588500.0, + "reward": 1.6155134439468384, + "reward_std": 0.3377233147621155, + "rewards/accuracy_reward/mean": 0.6964285969734192, + "rewards/accuracy_reward/std": 0.4603137969970703, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9190848469734192, + "rewards/tag_count_reward/std": 0.2115476429462433, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1059.5045166015625, + "completions/mean_terminated_length": 918.290771484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3038729955782857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11622874434177637, + "kl": 0.019317626953125, + "learning_rate": 8.911314641886279e-07, + "loss": 0.0491, + "num_tokens": 844132358.0, + "reward": 1.5552456378936768, + "reward_std": 0.34310492873191833, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835427761078, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.1903373748064041, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 950.9888916015625, + "completions/mean_terminated_length": 787.8436279296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3040860902455916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14548367393550904, + "kl": 0.021087646484375, + "learning_rate": 8.909130443903583e-07, + "loss": 0.0942, + "num_tokens": 844628193.0, + "reward": 1.5474331378936768, + "reward_std": 0.3538978695869446, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.20298878848552704, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 940.79248046875, + "completions/mean_terminated_length": 772.8612060546875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.30429918491289754, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14924011144771562, + "kl": 0.022491455078125, + "learning_rate": 8.906944359228133e-07, + "loss": 0.0863, + "num_tokens": 845116692.0, + "reward": 1.4754464626312256, + "reward_std": 0.34190747141838074, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.17203812301158905, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1048.3125, + "completions/mean_terminated_length": 814.2258911132812, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3045122795802035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1324983142377475, + "kl": 0.018096923828125, + "learning_rate": 8.904756389069762e-07, + "loss": 0.1439, + "num_tokens": 845663616.0, + "reward": 1.3688616752624512, + "reward_std": 0.42432478070259094, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8889508843421936, + "rewards/tag_count_reward/std": 0.2493293285369873, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1089.1160888671875, + "completions/mean_terminated_length": 893.2150268554688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.30472537424750945, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12359533576319118, + "kl": 0.0184326171875, + "learning_rate": 8.902566534639339e-07, + "loss": 0.0769, + "num_tokens": 846225460.0, + "reward": 1.31640625, + "reward_std": 0.34393852949142456, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.2005011886358261, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 998.60498046875, + "completions/mean_terminated_length": 784.2123413085938, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3049384689148154, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.20252959677262336, + "kl": 0.02154541015625, + "learning_rate": 8.900374797148784e-07, + "loss": 0.0606, + "num_tokens": 846743843.0, + "reward": 1.4547991752624512, + "reward_std": 0.3482241928577423, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9280133843421936, + "rewards/tag_count_reward/std": 0.1992206871509552, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 910.3348388671875, + "completions/mean_terminated_length": 751.1195678710938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.30515156358212137, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13393633284827047, + "kl": 0.02105712890625, + "learning_rate": 8.898181177811056e-07, + "loss": 0.0823, + "num_tokens": 847221081.0, + "reward": 1.469866156578064, + "reward_std": 0.3524143397808075, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.199243426322937, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 969.0335083007812, + "completions/mean_terminated_length": 824.2607421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3053646582494273, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12199333819665728, + "kl": 0.01800537109375, + "learning_rate": 8.895985677840153e-07, + "loss": 0.0557, + "num_tokens": 847716792.0, + "reward": 1.575334906578064, + "reward_std": 0.4001903831958771, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1747443825006485, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1069.8170166015625, + "completions/mean_terminated_length": 806.5665893554688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3055777529167333, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 1.7986631416389125, + "kl": 0.01934814453125, + "learning_rate": 8.89378829845112e-07, + "loss": 0.0716, + "num_tokens": 848264230.0, + "reward": 1.4302456378936768, + "reward_std": 0.3747895359992981, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9056919813156128, + "rewards/tag_count_reward/std": 0.23921002447605133, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 982.71435546875, + "completions/mean_terminated_length": 771.9358520507812, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3057908475840392, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12503018963776655, + "kl": 0.018768310546875, + "learning_rate": 8.891589040860035e-07, + "loss": 0.039, + "num_tokens": 848773622.0, + "reward": 1.4626116752624512, + "reward_std": 0.3829914629459381, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.2086479365825653, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 938.7678833007812, + "completions/mean_terminated_length": 760.6010131835938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.30600394225134514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13032465986521613, + "kl": 0.01953125, + "learning_rate": 8.88938790628402e-07, + "loss": 0.0855, + "num_tokens": 849261438.0, + "reward": 1.5200893878936768, + "reward_std": 0.34632810950279236, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.20075708627700806, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 971.5647583007812, + "completions/mean_terminated_length": 778.9395141601562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3062170369186511, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12226323904790523, + "kl": 0.01904296875, + "learning_rate": 8.887184895941234e-07, + "loss": 0.0517, + "num_tokens": 849766811.0, + "reward": 1.5664063692092896, + "reward_std": 0.3269766867160797, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.16479510068893433, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1100.609375, + "completions/mean_terminated_length": 862.4385375976562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.30643013158595706, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12605662153042588, + "kl": 0.016448974609375, + "learning_rate": 8.884980011050876e-07, + "loss": 0.1001, + "num_tokens": 850329948.0, + "reward": 1.4341518878936768, + "reward_std": 0.3133377730846405, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9118303656578064, + "rewards/tag_count_reward/std": 0.23598192632198334, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 995.6183471679688, + "completions/mean_terminated_length": 804.0238037109375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.306643226253263, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13059056908956307, + "kl": 0.017791748046875, + "learning_rate": 8.882773252833177e-07, + "loss": 0.096, + "num_tokens": 850846849.0, + "reward": 1.4280134439468384, + "reward_std": 0.41415274143218994, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9123883843421936, + "rewards/tag_count_reward/std": 0.22682885825634003, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 942.2969360351562, + "completions/mean_terminated_length": 790.7537841796875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.30685632092056897, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12149133384470984, + "kl": 0.02166748046875, + "learning_rate": 8.880564622509415e-07, + "loss": 0.025, + "num_tokens": 851333766.0, + "reward": 1.5368304252624512, + "reward_std": 0.2814602255821228, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.18040810525417328, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1174.8013916015625, + "completions/mean_terminated_length": 958.3258666992188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3070694155878749, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11595228758811701, + "kl": 0.0159149169921875, + "learning_rate": 8.878354121301893e-07, + "loss": 0.0889, + "num_tokens": 851931405.0, + "reward": 1.4012277126312256, + "reward_std": 0.3647383451461792, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9213169813156128, + "rewards/tag_count_reward/std": 0.21759268641471863, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1204.3504638671875, + "completions/mean_terminated_length": 888.6287841796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3072825102551809, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12719975783456006, + "kl": 0.0158233642578125, + "learning_rate": 8.876141750433957e-07, + "loss": 0.0943, + "num_tokens": 852544426.0, + "reward": 1.3632813692092896, + "reward_std": 0.3926142454147339, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.2444191426038742, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1024.5625, + "completions/mean_terminated_length": 841.4210815429688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3074956049224868, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1344979547262212, + "kl": 0.020660400390625, + "learning_rate": 8.873927511129985e-07, + "loss": 0.0852, + "num_tokens": 853074342.0, + "reward": 1.493303656578064, + "reward_std": 0.4106661379337311, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8839285969734192, + "rewards/tag_count_reward/std": 0.2506782114505768, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1126.8013916015625, + "completions/mean_terminated_length": 865.4871215820312, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.30770869958979274, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11972460647217888, + "kl": 0.016265869140625, + "learning_rate": 8.871711404615385e-07, + "loss": 0.1005, + "num_tokens": 853650717.0, + "reward": 1.3493304252624512, + "reward_std": 0.35324156284332275, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9073660969734192, + "rewards/tag_count_reward/std": 0.2245088666677475, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 966.37060546875, + "completions/mean_terminated_length": 839.5960083007812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3079217942570987, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13618951326020987, + "kl": 0.01934814453125, + "learning_rate": 8.869493432116606e-07, + "loss": 0.0628, + "num_tokens": 854149171.0, + "reward": 1.5758929252624512, + "reward_std": 0.3624713718891144, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.2171051949262619, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1086.8035888671875, + "completions/mean_terminated_length": 861.7300415039062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.30813488892440466, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12460285505484467, + "kl": 0.0168914794921875, + "learning_rate": 8.867273594861126e-07, + "loss": 0.1097, + "num_tokens": 854706651.0, + "reward": 1.3828126192092896, + "reward_std": 0.29108303785324097, + "rewards/accuracy_reward/mean": 0.48148149251937866, + "rewards/accuracy_reward/std": 0.5002362728118896, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9185267686843872, + "rewards/tag_count_reward/std": 0.2207179069519043, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 978.16748046875, + "completions/mean_terminated_length": 766.4893188476562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3083479835917106, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12489247846440525, + "kl": 0.02056884765625, + "learning_rate": 8.865051894077452e-07, + "loss": 0.0686, + "num_tokens": 855213094.0, + "reward": 1.4341518878936768, + "reward_std": 0.3647061288356781, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9095982313156128, + "rewards/tag_count_reward/std": 0.22541894018650055, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 937.71435546875, + "completions/mean_terminated_length": 752.6666870117188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3085610782590166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13967658804301156, + "kl": 0.02001953125, + "learning_rate": 8.862828330995129e-07, + "loss": 0.1596, + "num_tokens": 855696086.0, + "reward": 1.5178571939468384, + "reward_std": 0.35910525918006897, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.2277139127254486, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1096.8504638671875, + "completions/mean_terminated_length": 861.0501098632812, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.30877417292632253, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11889917322130462, + "kl": 0.0165252685546875, + "learning_rate": 8.860602906844726e-07, + "loss": 0.1172, + "num_tokens": 856258499.0, + "reward": 1.442522406578064, + "reward_std": 0.3507930338382721, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.19407851994037628, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1149.5826416015625, + "completions/mean_terminated_length": 929.969482421875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.3089872675936285, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12079038909325925, + "kl": 0.015411376953125, + "learning_rate": 8.858375622857847e-07, + "loss": 0.0608, + "num_tokens": 856851944.0, + "reward": 1.3342634439468384, + "reward_std": 0.35259315371513367, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9168526530265808, + "rewards/tag_count_reward/std": 0.22604598104953766, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 979.0402221679688, + "completions/mean_terminated_length": 794.3507690429688, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.30920036226093445, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15140166753525355, + "kl": 0.020233154296875, + "learning_rate": 8.856146480267124e-07, + "loss": 0.0894, + "num_tokens": 857357562.0, + "reward": 1.5429688692092896, + "reward_std": 0.32985129952430725, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.19111628830432892, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1029.4710693359375, + "completions/mean_terminated_length": 844.0396118164062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.30941345692824035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1506919165887795, + "kl": 0.01776123046875, + "learning_rate": 8.853915480306215e-07, + "loss": 0.1038, + "num_tokens": 857890125.0, + "reward": 1.5463169813156128, + "reward_std": 0.3308275640010834, + "rewards/accuracy_reward/mean": 0.6203703880310059, + "rewards/accuracy_reward/std": 0.48585736751556396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.17204447090625763, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1104.41748046875, + "completions/mean_terminated_length": 889.849365234375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.3096265515955463, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12930045866945938, + "kl": 0.01800537109375, + "learning_rate": 8.851682624209806e-07, + "loss": 0.1086, + "num_tokens": 858453880.0, + "reward": 1.4352679252624512, + "reward_std": 0.37976232171058655, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.1891407072544098, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 996.9620971679688, + "completions/mean_terminated_length": 785.6273803710938, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.30983964626285226, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1267697512715945, + "kl": 0.019775390625, + "learning_rate": 8.849447913213615e-07, + "loss": 0.0887, + "num_tokens": 858968295.0, + "reward": 1.5424107313156128, + "reward_std": 0.34806954860687256, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.2018609493970871, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1156.212158203125, + "completions/mean_terminated_length": 935.1281127929688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3100527409301582, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12480315459571291, + "kl": 0.016510009765625, + "learning_rate": 8.847211348554382e-07, + "loss": 0.0794, + "num_tokens": 859552198.0, + "reward": 1.3694196939468384, + "reward_std": 0.3284815549850464, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.18730680644512177, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 953.122802734375, + "completions/mean_terminated_length": 809.3510131835938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3102658355974642, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13132016542260447, + "kl": 0.01971435546875, + "learning_rate": 8.844972931469875e-07, + "loss": 0.0786, + "num_tokens": 860047325.0, + "reward": 1.5842634439468384, + "reward_std": 0.3621877431869507, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.21228989958763123, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 965.8795166015625, + "completions/mean_terminated_length": 811.290771484375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.31047893026477014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1259837857832474, + "kl": 0.02093505859375, + "learning_rate": 8.842732663198886e-07, + "loss": 0.096, + "num_tokens": 860551431.0, + "reward": 1.536272406578064, + "reward_std": 0.3589436411857605, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9246651530265808, + "rewards/tag_count_reward/std": 0.2096388190984726, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1039.6629638671875, + "completions/mean_terminated_length": 833.6586303710938, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.3106920249320761, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13331623789791222, + "kl": 0.01739501953125, + "learning_rate": 8.840490544981234e-07, + "loss": 0.0482, + "num_tokens": 861081632.0, + "reward": 1.4609376192092896, + "reward_std": 0.3299265503883362, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.17640554904937744, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1123.0804443359375, + "completions/mean_terminated_length": 880.7774047851562, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.31090511959938205, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12041190745192736, + "kl": 0.015289306640625, + "learning_rate": 8.838246578057757e-07, + "loss": 0.0706, + "num_tokens": 861657444.0, + "reward": 1.5212054252624512, + "reward_std": 0.3582844138145447, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19821317493915558, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1093.15185546875, + "completions/mean_terminated_length": 898.0752563476562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.31111821426668795, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12049862447409308, + "kl": 0.01751708984375, + "learning_rate": 8.836000763670319e-07, + "loss": 0.1311, + "num_tokens": 862220024.0, + "reward": 1.4587054252624512, + "reward_std": 0.355456680059433, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.21038557589054108, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1082.078125, + "completions/mean_terminated_length": 835.86279296875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.3113313089339939, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13660460735355878, + "kl": 0.01715087890625, + "learning_rate": 8.833753103061808e-07, + "loss": 0.12, + "num_tokens": 862777771.0, + "reward": 1.4419643878936768, + "reward_std": 0.40680035948753357, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9196428656578064, + "rewards/tag_count_reward/std": 0.22674697637557983, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1010.2567138671875, + "completions/mean_terminated_length": 840.4441528320312, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.31154440360129987, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.140369731126648, + "kl": 0.01898193359375, + "learning_rate": 8.831503597476131e-07, + "loss": 0.088, + "num_tokens": 863300750.0, + "reward": 1.4592634439468384, + "reward_std": 0.35447394847869873, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19590957462787628, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1028.0670166015625, + "completions/mean_terminated_length": 829.52001953125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3117574982686058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13436031129568018, + "kl": 0.018035888671875, + "learning_rate": 8.829252248158219e-07, + "loss": 0.1591, + "num_tokens": 863824716.0, + "reward": 1.4570313692092896, + "reward_std": 0.384103924036026, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9213169813156128, + "rewards/tag_count_reward/std": 0.20502042770385742, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1135.0848388671875, + "completions/mean_terminated_length": 869.365966796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3119705929359118, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12471225530275858, + "kl": 0.0172119140625, + "learning_rate": 8.82699905635402e-07, + "loss": 0.061, + "num_tokens": 864397026.0, + "reward": 1.3666294813156128, + "reward_std": 0.35109055042266846, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.23580926656723022, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1017.0848388671875, + "completions/mean_terminated_length": 819.6754760742188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.31218368760321774, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13100548626235745, + "kl": 0.019927978515625, + "learning_rate": 8.824744023310504e-07, + "loss": 0.1191, + "num_tokens": 864913064.0, + "reward": 1.5842634439468384, + "reward_std": 0.35711053013801575, + "rewards/accuracy_reward/mean": 0.6651785969734192, + "rewards/accuracy_reward/std": 0.47245556116104126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9190848469734192, + "rewards/tag_count_reward/std": 0.21352127194404602, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 985.6719360351562, + "completions/mean_terminated_length": 798.8582763671875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3123967822705237, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11966240567919921, + "kl": 0.0216064453125, + "learning_rate": 8.822487150275657e-07, + "loss": 0.11, + "num_tokens": 865421413.0, + "reward": 1.473772406578064, + "reward_std": 0.3350149989128113, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.18451586365699768, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1094.984375, + "completions/mean_terminated_length": 887.8070678710938, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.31260987693782966, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11003275335496294, + "kl": 0.016265869140625, + "learning_rate": 8.820228438498486e-07, + "loss": 0.0617, + "num_tokens": 865980750.0, + "reward": 1.4235491752624512, + "reward_std": 0.3181097209453583, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.18088065087795258, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1196.138427734375, + "completions/mean_terminated_length": 905.3832397460938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.31282297160513556, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12129027857759477, + "kl": 0.0159454345703125, + "learning_rate": 8.817967889229018e-07, + "loss": 0.0761, + "num_tokens": 866591292.0, + "reward": 1.3593751192092896, + "reward_std": 0.31770387291908264, + "rewards/accuracy_reward/mean": 0.44675925374031067, + "rewards/accuracy_reward/std": 0.4977337718009949, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.20392835140228271, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1013.7902221679688, + "completions/mean_terminated_length": 844.5558471679688, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.3130360662724415, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.21719677367013085, + "kl": 0.02008056640625, + "learning_rate": 8.815705503718291e-07, + "loss": 0.0643, + "num_tokens": 867116766.0, + "reward": 1.551897406578064, + "reward_std": 0.3549552857875824, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.20187872648239136, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 978.419677734375, + "completions/mean_terminated_length": 780.3491821289062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3132491609397475, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1426461907580897, + "kl": 0.0190887451171875, + "learning_rate": 8.813441283218365e-07, + "loss": 0.0735, + "num_tokens": 867620394.0, + "reward": 1.4285714626312256, + "reward_std": 0.23254993557929993, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.17920349538326263, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1162.2054443359375, + "completions/mean_terminated_length": 887.6608276367188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.31346225560705343, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11672412391722606, + "kl": 0.017486572265625, + "learning_rate": 8.811175228982311e-07, + "loss": 0.1135, + "num_tokens": 868204614.0, + "reward": 1.4492188692092896, + "reward_std": 0.3632122576236725, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.25323429703712463, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1092.0625, + "completions/mean_terminated_length": 810.2543334960938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3136753502743594, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37867127389765814, + "kl": 0.02203369140625, + "learning_rate": 8.808907342264215e-07, + "loss": 0.0922, + "num_tokens": 868773474.0, + "reward": 1.313616156578064, + "reward_std": 0.32644402980804443, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.1910289078950882, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1127.513427734375, + "completions/mean_terminated_length": 911.9724731445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.31388844494166535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13179886525244894, + "kl": 0.01605224609375, + "learning_rate": 8.806637624319181e-07, + "loss": 0.1, + "num_tokens": 869350568.0, + "reward": 1.4040179252624512, + "reward_std": 0.3833931088447571, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.18981274962425232, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1070.79248046875, + "completions/mean_terminated_length": 883.6675415039062, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.3141015396089713, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1286639624116333, + "kl": 0.019805908203125, + "learning_rate": 8.804366076403323e-07, + "loss": 0.0651, + "num_tokens": 869902411.0, + "reward": 1.4068081378936768, + "reward_std": 0.31713157892227173, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.20184780657291412, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1086.634033203125, + "completions/mean_terminated_length": 858.2431030273438, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.31431463427627726, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11614156694184209, + "kl": 0.0185546875, + "learning_rate": 8.80209269977377e-07, + "loss": 0.0299, + "num_tokens": 870458551.0, + "reward": 1.5033482313156128, + "reward_std": 0.2739241123199463, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.166563019156456, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1015.8928833007812, + "completions/mean_terminated_length": 801.6819458007812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.31452772894358316, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1423653452407796, + "kl": 0.01953125, + "learning_rate": 8.799817495688662e-07, + "loss": 0.1146, + "num_tokens": 870977863.0, + "reward": 1.5267857313156128, + "reward_std": 0.3386031687259674, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.20867261290550232, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1004.3348388671875, + "completions/mean_terminated_length": 767.0082397460938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3147408236108891, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13229236091058097, + "kl": 0.019775390625, + "learning_rate": 8.797540465407148e-07, + "loss": 0.1216, + "num_tokens": 871498429.0, + "reward": 1.4960938692092896, + "reward_std": 0.3151138722896576, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.19176188111305237, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1023.43310546875, + "completions/mean_terminated_length": 820.7112426757812, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.3149539182781951, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11991107627388635, + "kl": 0.019500732421875, + "learning_rate": 8.795261610189393e-07, + "loss": 0.0731, + "num_tokens": 872029359.0, + "reward": 1.4252232313156128, + "reward_std": 0.24590043723583221, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.19946885108947754, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1170.4285888671875, + "completions/mean_terminated_length": 924.7085571289062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.31516701294550103, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1170126633602437, + "kl": 0.0150146484375, + "learning_rate": 8.792980931296567e-07, + "loss": 0.0762, + "num_tokens": 872627919.0, + "reward": 1.3493304252624512, + "reward_std": 0.3699823319911957, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9207589030265808, + "rewards/tag_count_reward/std": 0.2138228863477707, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1090.509033203125, + "completions/mean_terminated_length": 879.1825561523438, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.315380107612807, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10467877151054188, + "kl": 0.01788330078125, + "learning_rate": 8.790698429990853e-07, + "loss": 0.0278, + "num_tokens": 873185363.0, + "reward": 1.3392857313156128, + "reward_std": 0.2985284924507141, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.485796183347702, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.15527118742465973, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 935.6160888671875, + "completions/mean_terminated_length": 789.5454711914062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.31559320228011295, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12118425502343895, + "kl": 0.020751953125, + "learning_rate": 8.788414107535437e-07, + "loss": 0.0705, + "num_tokens": 873665911.0, + "reward": 1.6367188692092896, + "reward_std": 0.27083009481430054, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.46403056383132935, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.16406624019145966, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1045.68310546875, + "completions/mean_terminated_length": 821.1201782226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3158062969474189, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1233821724481232, + "kl": 0.020263671875, + "learning_rate": 8.786127965194519e-07, + "loss": 0.0668, + "num_tokens": 874205593.0, + "reward": 1.4414063692092896, + "reward_std": 0.21451610326766968, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.14879968762397766, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1003.4308471679688, + "completions/mean_terminated_length": 826.154052734375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.31601939161472486, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.2609139189347654, + "kl": 0.0263671875, + "learning_rate": 8.783840004233306e-07, + "loss": 0.0835, + "num_tokens": 874716858.0, + "reward": 1.5837054252624512, + "reward_std": 0.33571502566337585, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.18280036747455597, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1085.9888916015625, + "completions/mean_terminated_length": 813.0974731445312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.31623248628203077, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12123836479665284, + "kl": 0.0180511474609375, + "learning_rate": 8.781550225918008e-07, + "loss": 0.0891, + "num_tokens": 875277397.0, + "reward": 1.422991156578064, + "reward_std": 0.32917022705078125, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.20016857981681824, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1035.3035888671875, + "completions/mean_terminated_length": 825.1212768554688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3164455809493367, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11800745621817481, + "kl": 0.01934814453125, + "learning_rate": 8.779258631515837e-07, + "loss": 0.0241, + "num_tokens": 875809997.0, + "reward": 1.4642857313156128, + "reward_std": 0.2984984517097473, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.18769630789756775, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1009.857177734375, + "completions/mean_terminated_length": 833.6710205078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3166586756166427, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13044376500978513, + "kl": 0.019195556640625, + "learning_rate": 8.776965222295023e-07, + "loss": 0.0619, + "num_tokens": 876328317.0, + "reward": 1.5301339626312256, + "reward_std": 0.3145487904548645, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.18126414716243744, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 967.9620971679688, + "completions/mean_terminated_length": 729.5885620117188, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.31687177028394864, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14855206873655472, + "kl": 0.018157958984375, + "learning_rate": 8.774669999524787e-07, + "loss": 0.1464, + "num_tokens": 876828540.0, + "reward": 1.5535714626312256, + "reward_std": 0.3178226947784424, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.1964321881532669, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1243.310302734375, + "completions/mean_terminated_length": 935.3425903320312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3170848649512546, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11817442877012374, + "kl": 0.0160980224609375, + "learning_rate": 8.772372964475362e-07, + "loss": 0.0782, + "num_tokens": 877456615.0, + "reward": 1.2723214626312256, + "reward_std": 0.37030497193336487, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8861607313156128, + "rewards/tag_count_reward/std": 0.2609747648239136, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1049.3773193359375, + "completions/mean_terminated_length": 845.3575439453125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.31729795961856055, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12894982420864745, + "kl": 0.0176544189453125, + "learning_rate": 8.770074118417981e-07, + "loss": 0.0673, + "num_tokens": 877995568.0, + "reward": 1.5223214626312256, + "reward_std": 0.3191626965999603, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.21056649088859558, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1030.9554443359375, + "completions/mean_terminated_length": 746.182861328125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3175110542858665, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12753345774012528, + "kl": 0.017669677734375, + "learning_rate": 8.767773462624876e-07, + "loss": 0.1138, + "num_tokens": 878527660.0, + "reward": 1.3476563692092896, + "reward_std": 0.34932416677474976, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9190848469734192, + "rewards/tag_count_reward/std": 0.22250016033649445, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 995.2410888671875, + "completions/mean_terminated_length": 813.3507690429688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.31772414895317247, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12915485732880436, + "kl": 0.01953125, + "learning_rate": 8.765470998369286e-07, + "loss": 0.1138, + "num_tokens": 879035272.0, + "reward": 1.528459906578064, + "reward_std": 0.3196329176425934, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.17854659259319305, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 993.294677734375, + "completions/mean_terminated_length": 781.2225341796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.31793724362047837, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12991545887580488, + "kl": 0.01861572265625, + "learning_rate": 8.76316672692545e-07, + "loss": 0.078, + "num_tokens": 879549996.0, + "reward": 1.547991156578064, + "reward_std": 0.3738611936569214, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.20101501047611237, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1068.118408203125, + "completions/mean_terminated_length": 807.9237060546875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.31815033828778433, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12911229925940468, + "kl": 0.018768310546875, + "learning_rate": 8.760860649568605e-07, + "loss": 0.1133, + "num_tokens": 880094529.0, + "reward": 1.419084906578064, + "reward_std": 0.3558329939842224, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8967633843421936, + "rewards/tag_count_reward/std": 0.25267162919044495, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1023.5870971679688, + "completions/mean_terminated_length": 830.6604614257812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3183634329550903, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1315496382652415, + "kl": 0.017730712890625, + "learning_rate": 8.758552767574988e-07, + "loss": 0.0821, + "num_tokens": 880618920.0, + "reward": 1.4910714626312256, + "reward_std": 0.3212418556213379, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.20194751024246216, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 980.8370971679688, + "completions/mean_terminated_length": 779.859375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.31857652762239624, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1288031127749245, + "kl": 0.0185546875, + "learning_rate": 8.756243082221834e-07, + "loss": 0.0803, + "num_tokens": 881128575.0, + "reward": 1.4017857313156128, + "reward_std": 0.2610216736793518, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.18918029963970184, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1080.9910888671875, + "completions/mean_terminated_length": 870.771728515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3187896222897022, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13064185902042835, + "kl": 0.0174560546875, + "learning_rate": 8.75393159478738e-07, + "loss": 0.1431, + "num_tokens": 881678395.0, + "reward": 1.4598214626312256, + "reward_std": 0.42064374685287476, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9040178656578064, + "rewards/tag_count_reward/std": 0.23766089975833893, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1060.46435546875, + "completions/mean_terminated_length": 794.6968994140625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.31900271695700816, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11573500887048836, + "kl": 0.016632080078125, + "learning_rate": 8.751618306550855e-07, + "loss": 0.0628, + "num_tokens": 882221195.0, + "reward": 1.4754464626312256, + "reward_std": 0.35096055269241333, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.21043601632118225, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1027.77685546875, + "completions/mean_terminated_length": 799.2021484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3192158116243141, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15932131728619467, + "kl": 0.0181884765625, + "learning_rate": 8.749303218792486e-07, + "loss": 0.0722, + "num_tokens": 882751751.0, + "reward": 1.4207589626312256, + "reward_std": 0.327421098947525, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19321222603321075, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1055.53125, + "completions/mean_terminated_length": 836.4849853515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3194289062916201, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12410364282696955, + "kl": 0.01806640625, + "learning_rate": 8.7469863327935e-07, + "loss": 0.1086, + "num_tokens": 883306277.0, + "reward": 1.4196429252624512, + "reward_std": 0.3600113093852997, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.21043600142002106, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1065.3482666015625, + "completions/mean_terminated_length": 854.97021484375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.319642000958926, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13419866764655478, + "kl": 0.017852783203125, + "learning_rate": 8.744667649836114e-07, + "loss": 0.0812, + "num_tokens": 883850225.0, + "reward": 1.4587054252624512, + "reward_std": 0.3719674050807953, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.18882031738758087, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1178.9710693359375, + "completions/mean_terminated_length": 899.5486450195312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.31985509562623193, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11262308267304726, + "kl": 0.0153350830078125, + "learning_rate": 8.742347171203541e-07, + "loss": 0.0092, + "num_tokens": 884443972.0, + "reward": 1.286272406578064, + "reward_std": 0.372450053691864, + "rewards/accuracy_reward/mean": 0.3616071343421936, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9246651530265808, + "rewards/tag_count_reward/std": 0.21360310912132263, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1080.5670166015625, + "completions/mean_terminated_length": 784.4140014648438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3200681902935379, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12876074398984771, + "kl": 0.0179443359375, + "learning_rate": 8.74002489817999e-07, + "loss": 0.0611, + "num_tokens": 885004754.0, + "reward": 1.3934152126312256, + "reward_std": 0.37429845333099365, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9246651530265808, + "rewards/tag_count_reward/std": 0.2174951434135437, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1199.529052734375, + "completions/mean_terminated_length": 867.5186767578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.32028128496084385, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11607804143545178, + "kl": 0.0154571533203125, + "learning_rate": 8.73770083205066e-07, + "loss": 0.1213, + "num_tokens": 885606207.0, + "reward": 1.3716518878936768, + "reward_std": 0.3596622049808502, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.22042357921600342, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1093.1138916015625, + "completions/mean_terminated_length": 872.7554931640625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3204943796281498, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12971223964835987, + "kl": 0.0174560546875, + "learning_rate": 8.735374974101746e-07, + "loss": 0.1606, + "num_tokens": 886161170.0, + "reward": 1.4614956378936768, + "reward_std": 0.4222429394721985, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9034598469734192, + "rewards/tag_count_reward/std": 0.2353641390800476, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1123.7679443359375, + "completions/mean_terminated_length": 844.3488159179688, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.32070747429545576, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13214166459449658, + "kl": 0.0168914794921875, + "learning_rate": 8.73304732562043e-07, + "loss": 0.0892, + "num_tokens": 886732666.0, + "reward": 1.313616156578064, + "reward_std": 0.3497132658958435, + "rewards/accuracy_reward/mean": 0.41435185074806213, + "rewards/accuracy_reward/std": 0.49318093061447144, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.2302192598581314, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 945.1183471679688, + "completions/mean_terminated_length": 800.2954711914062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.3209205689627617, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13326196932982115, + "kl": 0.020751953125, + "learning_rate": 8.730717887894887e-07, + "loss": 0.0694, + "num_tokens": 887218095.0, + "reward": 1.5954241752624512, + "reward_std": 0.26193109154701233, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.17706511914730072, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 941.0178833007812, + "completions/mean_terminated_length": 776.3897705078125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.3211336636300677, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1482744019615174, + "kl": 0.017669677734375, + "learning_rate": 8.728386662214284e-07, + "loss": 0.1017, + "num_tokens": 887704055.0, + "reward": 1.4693081378936768, + "reward_std": 0.35213610529899597, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.18900123238563538, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 947.3303833007812, + "completions/mean_terminated_length": 790.091796875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.32134675829737364, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12604777432422962, + "kl": 0.020721435546875, + "learning_rate": 8.726053649868776e-07, + "loss": 0.0869, + "num_tokens": 888203803.0, + "reward": 1.5452009439468384, + "reward_std": 0.36416730284690857, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.19597965478897095, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 984.97998046875, + "completions/mean_terminated_length": 801.3167724609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.32155985296467954, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11593775488836382, + "kl": 0.0166778564453125, + "learning_rate": 8.723718852149506e-07, + "loss": 0.0813, + "num_tokens": 888721138.0, + "reward": 1.3727679252624512, + "reward_std": 0.32188543677330017, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.15467506647109985, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1081.180908203125, + "completions/mean_terminated_length": 858.0687255859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3217729476319855, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1304980965494064, + "kl": 0.0179443359375, + "learning_rate": 8.721382270348604e-07, + "loss": 0.0917, + "num_tokens": 889271523.0, + "reward": 1.4704241752624512, + "reward_std": 0.3440109193325043, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.19078285992145538, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1077.49560546875, + "completions/mean_terminated_length": 866.5162963867188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32198604229929145, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12638790336304703, + "kl": 0.016021728515625, + "learning_rate": 8.719043905759193e-07, + "loss": 0.1131, + "num_tokens": 889826753.0, + "reward": 1.4988839626312256, + "reward_std": 0.36405402421951294, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.21358920633792877, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1068.8148193359375, + "completions/mean_terminated_length": 805.2946166992188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3221991369665974, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12852018396247916, + "kl": 0.01715087890625, + "learning_rate": 8.716703759675376e-07, + "loss": 0.1277, + "num_tokens": 890378574.0, + "reward": 1.3158482313156128, + "reward_std": 0.3472348749637604, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.2100292444229126, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1003.5022583007812, + "completions/mean_terminated_length": 783.3108520507812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32241223163390337, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12722051238431567, + "kl": 0.0179443359375, + "learning_rate": 8.714361833392246e-07, + "loss": 0.0931, + "num_tokens": 890896031.0, + "reward": 1.3727679252624512, + "reward_std": 0.3549431264400482, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.1964321881532669, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 898.294677734375, + "completions/mean_terminated_length": 727.3128662109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3226253263012093, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12412175829217294, + "kl": 0.020660400390625, + "learning_rate": 8.712018128205882e-07, + "loss": 0.0578, + "num_tokens": 891364387.0, + "reward": 1.6568081378936768, + "reward_std": 0.26207587122917175, + "rewards/accuracy_reward/mean": 0.6941964030265808, + "rewards/accuracy_reward/std": 0.4612620174884796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.14432783424854279, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1040.7567138671875, + "completions/mean_terminated_length": 808.3159790039062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3228384209685153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12382511975757346, + "kl": 0.018341064453125, + "learning_rate": 8.709672645413339e-07, + "loss": 0.0903, + "num_tokens": 891898438.0, + "reward": 1.5161831378936768, + "reward_std": 0.3817310631275177, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.18738260865211487, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1024.6273193359375, + "completions/mean_terminated_length": 841.4973754882812, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.32305151563582124, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12644921259199857, + "kl": 0.018890380859375, + "learning_rate": 8.707325386312669e-07, + "loss": 0.06, + "num_tokens": 892427679.0, + "reward": 1.3392857313156128, + "reward_std": 0.3286016881465912, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.1906658411026001, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 987.0223388671875, + "completions/mean_terminated_length": 745.7589111328125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.32326461030312714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13740912482892004, + "kl": 0.02032470703125, + "learning_rate": 8.704976352202896e-07, + "loss": 0.0765, + "num_tokens": 892944841.0, + "reward": 1.4001116752624512, + "reward_std": 0.3409937918186188, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.20590755343437195, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1041.2076416015625, + "completions/mean_terminated_length": 812.2657470703125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3234777049704331, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1244034618128558, + "kl": 0.01959228515625, + "learning_rate": 8.702625544384034e-07, + "loss": 0.0806, + "num_tokens": 893479014.0, + "reward": 1.4179688692092896, + "reward_std": 0.31756457686424255, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.20640410482883453, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1098.6429443359375, + "completions/mean_terminated_length": 863.286865234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32369079963773906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12279058836237525, + "kl": 0.01806640625, + "learning_rate": 8.700272964157072e-07, + "loss": 0.133, + "num_tokens": 894046870.0, + "reward": 1.4168527126312256, + "reward_std": 0.388259619474411, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8967633843421936, + "rewards/tag_count_reward/std": 0.24988947808742523, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1085.6429443359375, + "completions/mean_terminated_length": 853.7174072265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.323903894305045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12250596582811452, + "kl": 0.017059326171875, + "learning_rate": 8.697918612823985e-07, + "loss": 0.1056, + "num_tokens": 894608422.0, + "reward": 1.4754464626312256, + "reward_std": 0.37866777181625366, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855977296829224, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.2012539505958557, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1076.435302734375, + "completions/mean_terminated_length": 871.6189575195312, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.324116988972351, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1243985825296824, + "kl": 0.01959228515625, + "learning_rate": 8.695562491687726e-07, + "loss": 0.0999, + "num_tokens": 895155721.0, + "reward": 1.5412946939468384, + "reward_std": 0.3647215962409973, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9207589030265808, + "rewards/tag_count_reward/std": 0.2131679654121399, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1125.622802734375, + "completions/mean_terminated_length": 843.262451171875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.32433008363965693, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13405982126286933, + "kl": 0.01788330078125, + "learning_rate": 8.693204602052225e-07, + "loss": 0.1024, + "num_tokens": 895727792.0, + "reward": 1.3632813692092896, + "reward_std": 0.30096372961997986, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19155997037887573, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1018.0491333007812, + "completions/mean_terminated_length": 843.2532958984375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3245431783069629, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11830001310244277, + "kl": 0.018768310546875, + "learning_rate": 8.690844945222397e-07, + "loss": 0.0569, + "num_tokens": 896245926.0, + "reward": 1.5424107313156128, + "reward_std": 0.33029937744140625, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.17515915632247925, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 978.24560546875, + "completions/mean_terminated_length": 831.62939453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32475627297426884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13150153059147593, + "kl": 0.020477294921875, + "learning_rate": 8.688483522504131e-07, + "loss": 0.0823, + "num_tokens": 896752228.0, + "reward": 1.567522406578064, + "reward_std": 0.37251636385917664, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.19617067277431488, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1131.9866943359375, + "completions/mean_terminated_length": 855.0523071289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.32496936764157475, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.6671070064468791, + "kl": 0.0179443359375, + "learning_rate": 8.686120335204291e-07, + "loss": 0.0679, + "num_tokens": 897336030.0, + "reward": 1.387834906578064, + "reward_std": 0.2983705699443817, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.1540675312280655, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1053.087158203125, + "completions/mean_terminated_length": 833.5013427734375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.3251824623088807, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1284438081510803, + "kl": 0.019622802734375, + "learning_rate": 8.683755384630724e-07, + "loss": 0.0953, + "num_tokens": 897883013.0, + "reward": 1.430803656578064, + "reward_std": 0.36074790358543396, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.20961573719978333, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1106.8013916015625, + "completions/mean_terminated_length": 850.11083984375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32539555697618666, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1358512082235821, + "kl": 0.01702880859375, + "learning_rate": 8.681388672092247e-07, + "loss": 0.1262, + "num_tokens": 898448972.0, + "reward": 1.3766741752624512, + "reward_std": 0.396701455116272, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.2396480143070221, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1009.310302734375, + "completions/mean_terminated_length": 807.1119995117188, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3256086516434926, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13397808919569004, + "kl": 0.021820068359375, + "learning_rate": 8.679020198898654e-07, + "loss": 0.076, + "num_tokens": 898969735.0, + "reward": 1.4804688692092896, + "reward_std": 0.30424928665161133, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.19111628830432892, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1095.3148193359375, + "completions/mean_terminated_length": 915.8965454101562, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.3258217463107986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12875118069556546, + "kl": 0.0185394287109375, + "learning_rate": 8.676649966360714e-07, + "loss": 0.0822, + "num_tokens": 899524980.0, + "reward": 1.5078126192092896, + "reward_std": 0.32376590371131897, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.20024341344833374, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 963.2567138671875, + "completions/mean_terminated_length": 785.7532348632812, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.32603484097810453, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11945135308737737, + "kl": 0.019805908203125, + "learning_rate": 8.67427797579017e-07, + "loss": 0.0858, + "num_tokens": 900030551.0, + "reward": 1.4944196939468384, + "reward_std": 0.27198731899261475, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.15819832682609558, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1041.825927734375, + "completions/mean_terminated_length": 826.4119262695312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3262479356454105, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11744061159742629, + "kl": 0.019317626953125, + "learning_rate": 8.671904228499737e-07, + "loss": 0.0359, + "num_tokens": 900568249.0, + "reward": 1.4804688692092896, + "reward_std": 0.30808940529823303, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.18297357857227325, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1057.247802734375, + "completions/mean_terminated_length": 801.210693359375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.32646103031271645, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12485111826515224, + "kl": 0.017364501953125, + "learning_rate": 8.669528725803102e-07, + "loss": 0.0855, + "num_tokens": 901111544.0, + "reward": 1.4001116752624512, + "reward_std": 0.37890103459358215, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.21437901258468628, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 950.7701416015625, + "completions/mean_terminated_length": 740.6622314453125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.32667412498002235, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1724657807321857, + "kl": 0.02105712890625, + "learning_rate": 8.667151469014923e-07, + "loss": 0.1136, + "num_tokens": 901601505.0, + "reward": 1.5976563692092896, + "reward_std": 0.34070268273353577, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18863096833229065, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1034.852783203125, + "completions/mean_terminated_length": 811.2424926757812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3268872196473283, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15902213604534607, + "kl": 0.018341064453125, + "learning_rate": 8.664772459450831e-07, + "loss": 0.0698, + "num_tokens": 902135023.0, + "reward": 1.4157366752624512, + "reward_std": 0.34358060359954834, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.1962660849094391, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1070.6898193359375, + "completions/mean_terminated_length": 838.5111083984375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32710031431463427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1233316798091126, + "kl": 0.018157958984375, + "learning_rate": 8.662391698427426e-07, + "loss": 0.1105, + "num_tokens": 902679860.0, + "reward": 1.4068081378936768, + "reward_std": 0.38109514117240906, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.23402369022369385, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1028.4241943359375, + "completions/mean_terminated_length": 813.4865112304688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3273134089819402, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13722982106341639, + "kl": 0.0196533203125, + "learning_rate": 8.660009187262277e-07, + "loss": 0.0712, + "num_tokens": 903205618.0, + "reward": 1.5412946939468384, + "reward_std": 0.2835904359817505, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.17065013945102692, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1024.375, + "completions/mean_terminated_length": 818.5523071289062, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.3275265036492462, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14239299096983507, + "kl": 0.019989013671875, + "learning_rate": 8.657624927273919e-07, + "loss": 0.0839, + "num_tokens": 903730394.0, + "reward": 1.3883929252624512, + "reward_std": 0.2794315218925476, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.19922147691249847, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 950.950927734375, + "completions/mean_terminated_length": 774.7409057617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32773959831655214, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12586889278687977, + "kl": 0.019439697265625, + "learning_rate": 8.65523891978186e-07, + "loss": 0.0559, + "num_tokens": 904232180.0, + "reward": 1.5345982313156128, + "reward_std": 0.3146071434020996, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.17708362638950348, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1199.015625, + "completions/mean_terminated_length": 909.2425537109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3279526929838581, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12037489857620279, + "kl": 0.016448974609375, + "learning_rate": 8.652851166106573e-07, + "loss": 0.11, + "num_tokens": 904840571.0, + "reward": 1.3950893878936768, + "reward_std": 0.38501065969467163, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9151785969734192, + "rewards/tag_count_reward/std": 0.22758229076862335, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1155.6920166015625, + "completions/mean_terminated_length": 899.2816162109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32816578765116405, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.125584265905455, + "kl": 0.017059326171875, + "learning_rate": 8.650461667569495e-07, + "loss": 0.1083, + "num_tokens": 905427617.0, + "reward": 1.4481027126312256, + "reward_std": 0.4261804223060608, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.22890526056289673, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1157.665283203125, + "completions/mean_terminated_length": 924.4224853515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32837888231846996, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11284061539316523, + "kl": 0.01806640625, + "learning_rate": 8.648070425493031e-07, + "loss": 0.0466, + "num_tokens": 906014939.0, + "reward": 1.5022321939468384, + "reward_std": 0.35935840010643005, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.19424648582935333, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1055.7545166015625, + "completions/mean_terminated_length": 840.0489501953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3285919769857759, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13001606876846988, + "kl": 0.017578125, + "learning_rate": 8.645677441200551e-07, + "loss": 0.0696, + "num_tokens": 906560957.0, + "reward": 1.3738839626312256, + "reward_std": 0.2935698628425598, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.17708362638950348, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1166.296875, + "completions/mean_terminated_length": 922.6353149414062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.32880507165308187, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1047703457229358, + "kl": 0.017791748046875, + "learning_rate": 8.643282716016388e-07, + "loss": 0.0467, + "num_tokens": 907152178.0, + "reward": 1.422991156578064, + "reward_std": 0.33721673488616943, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.20895659923553467, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1134.5960693359375, + "completions/mean_terminated_length": 929.9535522460938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32901816632038783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11804426593857516, + "kl": 0.019012451171875, + "learning_rate": 8.640886251265839e-07, + "loss": 0.0087, + "num_tokens": 907730077.0, + "reward": 1.5083706378936768, + "reward_std": 0.3407495617866516, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.18076327443122864, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1204.4263916015625, + "completions/mean_terminated_length": 977.4022827148438, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.3292312609876938, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10595729823429317, + "kl": 0.0156097412109375, + "learning_rate": 8.638488048275166e-07, + "loss": 0.0311, + "num_tokens": 908342508.0, + "reward": 1.426897406578064, + "reward_std": 0.34525033831596375, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.19482967257499695, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1026.997802734375, + "completions/mean_terminated_length": 821.7024536132812, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.32944435565499974, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12195332760302954, + "kl": 0.017730712890625, + "learning_rate": 8.636088108371588e-07, + "loss": 0.1093, + "num_tokens": 908870347.0, + "reward": 1.587053656578064, + "reward_std": 0.3584185242652893, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.18620048463344574, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 906.904052734375, + "completions/mean_terminated_length": 753.794921875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.3296574503223057, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1307723429054731, + "kl": 0.020294189453125, + "learning_rate": 8.633686432883289e-07, + "loss": 0.091, + "num_tokens": 909347008.0, + "reward": 1.6238839626312256, + "reward_std": 0.26169130206108093, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16231150925159454, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 1108.0023193359375, + "completions/mean_terminated_length": 848.2307739257812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.32987054498961166, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13747905334302876, + "kl": 0.018402099609375, + "learning_rate": 8.631283023139413e-07, + "loss": 0.0845, + "num_tokens": 909908129.0, + "reward": 1.4587054252624512, + "reward_std": 0.3720957338809967, + "rewards/accuracy_reward/mean": 0.5462962985038757, + "rewards/accuracy_reward/std": 0.49842923879623413, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.1945772022008896, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1013.4375610351562, + "completions/mean_terminated_length": 795.340576171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33008363965691756, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1393923780844507, + "kl": 0.019775390625, + "learning_rate": 8.628877880470062e-07, + "loss": 0.1057, + "num_tokens": 910423925.0, + "reward": 1.5329241752624512, + "reward_std": 0.3651920258998871, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9213169813156128, + "rewards/tag_count_reward/std": 0.21694914996623993, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1132.1942138671875, + "completions/mean_terminated_length": 785.596923828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3302967343242235, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1101738653267016, + "kl": 0.015960693359375, + "learning_rate": 8.6264710062063e-07, + "loss": 0.0912, + "num_tokens": 911001276.0, + "reward": 1.3270089626312256, + "reward_std": 0.3444467782974243, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.2043900191783905, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1040.6607666015625, + "completions/mean_terminated_length": 841.3475952148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3305098289915295, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11522031122400136, + "kl": 0.01885986328125, + "learning_rate": 8.624062401680148e-07, + "loss": 0.0973, + "num_tokens": 911540996.0, + "reward": 1.5842634439468384, + "reward_std": 0.38823845982551575, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.17547869682312012, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1038.415283203125, + "completions/mean_terminated_length": 828.878662109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33072292365883543, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13906163516479314, + "kl": 0.016845703125, + "learning_rate": 8.621652068224582e-07, + "loss": 0.0784, + "num_tokens": 912079230.0, + "reward": 1.3945313692092896, + "reward_std": 0.3579113185405731, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.20285958051681519, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1086.305908203125, + "completions/mean_terminated_length": 830.940673828125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.3309360183261414, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.132157395439325, + "kl": 0.018096923828125, + "learning_rate": 8.619240007173541e-07, + "loss": 0.0815, + "num_tokens": 912637815.0, + "reward": 1.4397321939468384, + "reward_std": 0.3675016164779663, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.2259417027235031, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1042.571533203125, + "completions/mean_terminated_length": 868.858642578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33114911299344735, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12823196655768887, + "kl": 0.019683837890625, + "learning_rate": 8.61682621986191e-07, + "loss": 0.1226, + "num_tokens": 913173367.0, + "reward": 1.5022321939468384, + "reward_std": 0.3428337275981903, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.19624143838882446, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1011.5357666015625, + "completions/mean_terminated_length": 772.3516845703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3313622076607533, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11278272137202248, + "kl": 0.01922607421875, + "learning_rate": 8.61441070762554e-07, + "loss": 0.0378, + "num_tokens": 913690855.0, + "reward": 1.4001116752624512, + "reward_std": 0.25777122378349304, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.21654021739959717, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1036.6629638671875, + "completions/mean_terminated_length": 858.8162841796875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.33157530232805926, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11554814236829952, + "kl": 0.0181884765625, + "learning_rate": 8.611993471801232e-07, + "loss": 0.0325, + "num_tokens": 914225344.0, + "reward": 1.3521206378936768, + "reward_std": 0.2653568983078003, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.16564138233661652, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 919.8906860351562, + "completions/mean_terminated_length": 710.9814453125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.33178839699536516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14870361635136725, + "kl": 0.019561767578125, + "learning_rate": 8.609574513726739e-07, + "loss": 0.1583, + "num_tokens": 914703423.0, + "reward": 1.3939732313156128, + "reward_std": 0.33611148595809937, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.21293358504772186, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1093.868408203125, + "completions/mean_terminated_length": 775.8244018554688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3320014916626711, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1296739443720335, + "kl": 0.0181884765625, + "learning_rate": 8.607153834740771e-07, + "loss": 0.0868, + "num_tokens": 915264212.0, + "reward": 1.3967634439468384, + "reward_std": 0.37327054142951965, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.21386010944843292, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1032.743408203125, + "completions/mean_terminated_length": 866.6103515625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.3322145863299771, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13803957140549328, + "kl": 0.018585205078125, + "learning_rate": 8.604731436182988e-07, + "loss": 0.0823, + "num_tokens": 915794305.0, + "reward": 1.4949777126312256, + "reward_std": 0.34452149271965027, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18260475993156433, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1014.08935546875, + "completions/mean_terminated_length": 825.8575439453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33242768099728304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12731579006375046, + "kl": 0.020782470703125, + "learning_rate": 8.602307319394001e-07, + "loss": 0.1107, + "num_tokens": 916315113.0, + "reward": 1.4949777126312256, + "reward_std": 0.3242989778518677, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18639397621154785, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 988.79248046875, + "completions/mean_terminated_length": 802.5275268554688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.332640775664589, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13914407672937937, + "kl": 0.018951416015625, + "learning_rate": 8.599881485715374e-07, + "loss": 0.0934, + "num_tokens": 916829132.0, + "reward": 1.5290179252624512, + "reward_std": 0.3423592448234558, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.18086770176887512, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1046.497802734375, + "completions/mean_terminated_length": 798.2144775390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33285387033189495, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11161299470516874, + "kl": 0.018096923828125, + "learning_rate": 8.597453936489623e-07, + "loss": 0.077, + "num_tokens": 917367595.0, + "reward": 1.4849331378936768, + "reward_std": 0.33410611748695374, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19267114996910095, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 1030.1273193359375, + "completions/mean_terminated_length": 795.2335205078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3330669649992009, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13171322853169354, + "kl": 0.01971435546875, + "learning_rate": 8.595024673060204e-07, + "loss": 0.1117, + "num_tokens": 917895044.0, + "reward": 1.5156251192092896, + "reward_std": 0.3379441797733307, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.20910291373729706, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 971.24560546875, + "completions/mean_terminated_length": 817.4234619140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33328005966650687, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12838801195280714, + "kl": 0.020477294921875, + "learning_rate": 8.592593696771538e-07, + "loss": 0.0671, + "num_tokens": 918398610.0, + "reward": 1.5552456378936768, + "reward_std": 0.3218420147895813, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.17538617551326752, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 1021.1094360351562, + "completions/mean_terminated_length": 797.872314453125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.33349315433381277, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1198834553972838, + "kl": 0.021087646484375, + "learning_rate": 8.590161008968975e-07, + "loss": 0.0434, + "num_tokens": 918919443.0, + "reward": 1.5468751192092896, + "reward_std": 0.29237571358680725, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.16445480287075043, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1155.638427734375, + "completions/mean_terminated_length": 934.4122314453125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3337062490011187, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11622894785731466, + "kl": 0.0168304443359375, + "learning_rate": 8.587726610998824e-07, + "loss": 0.0745, + "num_tokens": 919513489.0, + "reward": 1.3666294813156128, + "reward_std": 0.32794949412345886, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9135044813156128, + "rewards/tag_count_reward/std": 0.2247832715511322, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1216.8773193359375, + "completions/mean_terminated_length": 862.1942749023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3339193436684247, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12281726539302042, + "kl": 0.016082763671875, + "learning_rate": 8.585290504208341e-07, + "loss": 0.0703, + "num_tokens": 920124906.0, + "reward": 1.2712054252624512, + "reward_std": 0.33053600788116455, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.21358920633792877, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 954.0960083007812, + "completions/mean_terminated_length": 730.6102294921875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.33413243833573064, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13504144612283336, + "kl": 0.019683837890625, + "learning_rate": 8.582852689945722e-07, + "loss": 0.0947, + "num_tokens": 920622165.0, + "reward": 1.551897406578064, + "reward_std": 0.29022881388664246, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.17221127450466156, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1005.6094360351562, + "completions/mean_terminated_length": 865.7443237304688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3343455330030366, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13380214733181806, + "kl": 0.019317626953125, + "learning_rate": 8.580413169560112e-07, + "loss": 0.1117, + "num_tokens": 921135782.0, + "reward": 1.5680804252624512, + "reward_std": 0.30227774381637573, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.17561113834381104, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1239.716552734375, + "completions/mean_terminated_length": 940.626953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33455862767034256, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12449077140104582, + "kl": 0.014678955078125, + "learning_rate": 8.577971944401598e-07, + "loss": 0.1119, + "num_tokens": 921755911.0, + "reward": 1.3415179252624512, + "reward_std": 0.38318243622779846, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8861607313156128, + "rewards/tag_count_reward/std": 0.26522624492645264, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1074.134033203125, + "completions/mean_terminated_length": 852.6795043945312, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.3347717223376485, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14544190800650036, + "kl": 0.016754150390625, + "learning_rate": 8.575529015821212e-07, + "loss": 0.1399, + "num_tokens": 922310067.0, + "reward": 1.395647406578064, + "reward_std": 0.3073558807373047, + "rewards/accuracy_reward/mean": 0.48379629850387573, + "rewards/accuracy_reward/std": 0.5003167986869812, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.2196313887834549, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1033.024658203125, + "completions/mean_terminated_length": 791.8978271484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33498481700495447, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.133962976415893, + "kl": 0.020050048828125, + "learning_rate": 8.573084385170927e-07, + "loss": 0.0712, + "num_tokens": 922847822.0, + "reward": 1.563616156578064, + "reward_std": 0.34568989276885986, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.18280036747455597, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1098.3013916015625, + "completions/mean_terminated_length": 907.3432006835938, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.33519791167226043, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13238663547720864, + "kl": 0.01739501953125, + "learning_rate": 8.570638053803659e-07, + "loss": 0.0763, + "num_tokens": 923410213.0, + "reward": 1.4631696939468384, + "reward_std": 0.3122788965702057, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.17409753799438477, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1031.63623046875, + "completions/mean_terminated_length": 830.5374755859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33541100633956633, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1452290616321746, + "kl": 0.019439697265625, + "learning_rate": 8.568190023073265e-07, + "loss": 0.0984, + "num_tokens": 923938002.0, + "reward": 1.4960938692092896, + "reward_std": 0.3545323610305786, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9291294813156128, + "rewards/tag_count_reward/std": 0.20514829456806183, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1081.075927734375, + "completions/mean_terminated_length": 908.0474243164062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3356241010068723, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11712048381827228, + "kl": 0.017974853515625, + "learning_rate": 8.565740294334544e-07, + "loss": 0.0641, + "num_tokens": 924492500.0, + "reward": 1.5312501192092896, + "reward_std": 0.2865827977657318, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.1884397715330124, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1043.075927734375, + "completions/mean_terminated_length": 850.6436157226562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.33583719567417825, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13070003789527895, + "kl": 0.020538330078125, + "learning_rate": 8.563288868943232e-07, + "loss": 0.0453, + "num_tokens": 925038982.0, + "reward": 1.5379464626312256, + "reward_std": 0.3534909188747406, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.1970413774251938, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1010.8750610351562, + "completions/mean_terminated_length": 850.4948120117188, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.3360502903414842, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12501885986782266, + "kl": 0.018585205078125, + "learning_rate": 8.560835748256007e-07, + "loss": 0.0981, + "num_tokens": 925561102.0, + "reward": 1.598772406578064, + "reward_std": 0.32354316115379333, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764313220977783, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1787492334842682, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1033.4375, + "completions/mean_terminated_length": 799.3077392578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33626338500879016, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11169122723076931, + "kl": 0.01708984375, + "learning_rate": 8.55838093363048e-07, + "loss": 0.0422, + "num_tokens": 926096178.0, + "reward": 1.4609376192092896, + "reward_std": 0.34643271565437317, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.19318637251853943, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1138.977783203125, + "completions/mean_terminated_length": 900.8394165039062, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.3364764796760961, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11359512424902131, + "kl": 0.0154266357421875, + "learning_rate": 8.555924426425209e-07, + "loss": 0.0612, + "num_tokens": 926674664.0, + "reward": 1.481584906578064, + "reward_std": 0.3570943772792816, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.1951945573091507, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1124.700927734375, + "completions/mean_terminated_length": 911.6318969726562, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.3366895743434021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12104951916944742, + "kl": 0.017547607421875, + "learning_rate": 8.553466227999675e-07, + "loss": 0.0544, + "num_tokens": 927253618.0, + "reward": 1.4587054252624512, + "reward_std": 0.3643728792667389, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.20681875944137573, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1041.7054443359375, + "completions/mean_terminated_length": 855.3544921875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.33690266901070803, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12576589918029712, + "kl": 0.020172119140625, + "learning_rate": 8.551006339714308e-07, + "loss": 0.0497, + "num_tokens": 927787726.0, + "reward": 1.5496652126312256, + "reward_std": 0.34323054552078247, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295229911804, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.18222837150096893, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 976.0826416015625, + "completions/mean_terminated_length": 803.9093017578125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.33711576367801394, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.131544968487818, + "kl": 0.01971435546875, + "learning_rate": 8.548544762930469e-07, + "loss": 0.1391, + "num_tokens": 928285747.0, + "reward": 1.4140626192092896, + "reward_std": 0.35008323192596436, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.20490244030952454, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1060.6763916015625, + "completions/mean_terminated_length": 826.1188354492188, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.3373288583453199, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1192012317907316, + "kl": 0.01849365234375, + "learning_rate": 8.546081499010449e-07, + "loss": 0.0902, + "num_tokens": 928825474.0, + "reward": 1.4977679252624512, + "reward_std": 0.366095632314682, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.20075708627700806, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1057.930908203125, + "completions/mean_terminated_length": 862.0347900390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.33754195301262585, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10822808637642121, + "kl": 0.017669677734375, + "learning_rate": 8.543616549317475e-07, + "loss": 0.0508, + "num_tokens": 929366435.0, + "reward": 1.4291294813156128, + "reward_std": 0.2421514391899109, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.16769373416900635, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1055.171875, + "completions/mean_terminated_length": 842.6151733398438, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.3377550476799318, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12156121799100028, + "kl": 0.01904296875, + "learning_rate": 8.541149915215711e-07, + "loss": 0.061, + "num_tokens": 929906784.0, + "reward": 1.571428656578064, + "reward_std": 0.30008241534233093, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.20923420786857605, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1114.7991943359375, + "completions/mean_terminated_length": 873.6348266601562, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.33796814234723777, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12698244792293686, + "kl": 0.018463134765625, + "learning_rate": 8.538681598070248e-07, + "loss": 0.0797, + "num_tokens": 930476662.0, + "reward": 1.5106027126312256, + "reward_std": 0.32206472754478455, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.20484988391399384, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1074.54248046875, + "completions/mean_terminated_length": 878.8070068359375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.3381812370145437, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.234242915234687, + "kl": 0.0255126953125, + "learning_rate": 8.536211599247114e-07, + "loss": 0.1117, + "num_tokens": 931036441.0, + "reward": 1.3454241752624512, + "reward_std": 0.2859877347946167, + "rewards/accuracy_reward/mean": 0.43518519401550293, + "rewards/accuracy_reward/std": 0.4963560700416565, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.2126835286617279, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1158.9085693359375, + "completions/mean_terminated_length": 913.2051391601562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3383943316818497, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12134259593881014, + "kl": 0.0159912109375, + "learning_rate": 8.53373992011326e-07, + "loss": 0.08, + "num_tokens": 931617584.0, + "reward": 1.3861607313156128, + "reward_std": 0.3382651209831238, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.21056649088859558, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1014.4241333007812, + "completions/mean_terminated_length": 803.263427734375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.33860742634915564, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.9260785552464367, + "kl": 0.124237060546875, + "learning_rate": 8.531266562036576e-07, + "loss": 0.0619, + "num_tokens": 932150094.0, + "reward": 1.4888393878936768, + "reward_std": 0.3059941530227661, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.194285050034523, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1115.732177734375, + "completions/mean_terminated_length": 900.5934448242188, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.33882052101646154, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11824290626565354, + "kl": 0.01837158203125, + "learning_rate": 8.528791526385871e-07, + "loss": 0.0451, + "num_tokens": 932719062.0, + "reward": 1.4648438692092896, + "reward_std": 0.2916608452796936, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.19759102165699005, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1055.044677734375, + "completions/mean_terminated_length": 883.4869384765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3390336156837675, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12374272301341897, + "kl": 0.01995849609375, + "learning_rate": 8.526314814530892e-07, + "loss": 0.0726, + "num_tokens": 933263114.0, + "reward": 1.4994419813156128, + "reward_std": 0.2921116054058075, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316954612732, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.17547869682312012, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1082.8013916015625, + "completions/mean_terminated_length": 866.5546264648438, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.33924671035107345, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.34648282663667723, + "kl": 0.03204345703125, + "learning_rate": 8.523836427842306e-07, + "loss": 0.1089, + "num_tokens": 933821537.0, + "reward": 1.4598214626312256, + "reward_std": 0.33889105916023254, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.18674945831298828, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1096.1607666015625, + "completions/mean_terminated_length": 907.8289184570312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3394598050183794, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.332481468693029, + "kl": 0.02239990234375, + "learning_rate": 8.521356367691713e-07, + "loss": 0.0718, + "num_tokens": 934379849.0, + "reward": 1.4843751192092896, + "reward_std": 0.3701212704181671, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.19500340521335602, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 951.0826416015625, + "completions/mean_terminated_length": 771.5869750976562, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.33967289968568537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12518180098565113, + "kl": 0.018768310546875, + "learning_rate": 8.518874635451635e-07, + "loss": 0.0514, + "num_tokens": 934871598.0, + "reward": 1.6171876192092896, + "reward_std": 0.3468630909919739, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1567714661359787, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1188.2679443359375, + "completions/mean_terminated_length": 915.176513671875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3398859943529913, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14599288429524476, + "kl": 0.017181396484375, + "learning_rate": 8.516391232495522e-07, + "loss": 0.0933, + "num_tokens": 935476070.0, + "reward": 1.2873884439468384, + "reward_std": 0.36599746346473694, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9079241156578064, + "rewards/tag_count_reward/std": 0.24354930222034454, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1187.5045166015625, + "completions/mean_terminated_length": 940.2356567382812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3400990890202973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12111080847835187, + "kl": 0.0151824951171875, + "learning_rate": 8.513906160197748e-07, + "loss": 0.1035, + "num_tokens": 936080808.0, + "reward": 1.328125, + "reward_std": 0.35253384709358215, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.4903542101383209, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.2222987711429596, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1010.2410888671875, + "completions/mean_terminated_length": 843.5543823242188, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.34031218368760324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13419886506615158, + "kl": 0.017974853515625, + "learning_rate": 8.511419419933606e-07, + "loss": 0.1251, + "num_tokens": 936596660.0, + "reward": 1.5368304252624512, + "reward_std": 0.3319348990917206, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.17943671345710754, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1148.415283203125, + "completions/mean_terminated_length": 928.5166625976562, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.34052527835490914, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10581830156494289, + "kl": 0.0153656005859375, + "learning_rate": 8.508931013079322e-07, + "loss": 0.042, + "num_tokens": 937177262.0, + "reward": 1.344866156578064, + "reward_std": 0.37663909792900085, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134778022766, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9341517686843872, + "rewards/tag_count_reward/std": 0.20580217242240906, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1068.32373046875, + "completions/mean_terminated_length": 835.5828857421875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3407383730222151, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10760653346547906, + "kl": 0.016357421875, + "learning_rate": 8.506440941012037e-07, + "loss": 0.0583, + "num_tokens": 937733695.0, + "reward": 1.5139509439468384, + "reward_std": 0.33474764227867126, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.1653849333524704, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 978.76123046875, + "completions/mean_terminated_length": 770.615966796875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.34095146768952106, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12376230938122178, + "kl": 0.0196533203125, + "learning_rate": 8.503949205109813e-07, + "loss": 0.0755, + "num_tokens": 938238756.0, + "reward": 1.501116156578064, + "reward_std": 0.342297226190567, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19029554724693298, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 995.888427734375, + "completions/mean_terminated_length": 787.7166137695312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.341164562356827, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12540608643126913, + "kl": 0.021453857421875, + "learning_rate": 8.501455806751638e-07, + "loss": 0.0431, + "num_tokens": 938749938.0, + "reward": 1.544084906578064, + "reward_std": 0.36465775966644287, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.18413659930229187, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1142.1585693359375, + "completions/mean_terminated_length": 878.49853515625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.341377657024133, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12874172540854856, + "kl": 0.016998291015625, + "learning_rate": 8.498960747317416e-07, + "loss": 0.0915, + "num_tokens": 939326281.0, + "reward": 1.325334906578064, + "reward_std": 0.30375322699546814, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18336889147758484, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1016.4107666015625, + "completions/mean_terminated_length": 853.808837890625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.34159075169143893, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12827477825000136, + "kl": 0.01812744140625, + "learning_rate": 8.496464028187969e-07, + "loss": 0.0836, + "num_tokens": 939854353.0, + "reward": 1.4743304252624512, + "reward_std": 0.3306964635848999, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.20006877183914185, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1177.107177734375, + "completions/mean_terminated_length": 900.4705810546875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3418038463587449, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11432165066473485, + "kl": 0.0154876708984375, + "learning_rate": 8.493965650745043e-07, + "loss": 0.0696, + "num_tokens": 940446081.0, + "reward": 1.3303571939468384, + "reward_std": 0.34219592809677124, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.1949649602174759, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1133.685302734375, + "completions/mean_terminated_length": 922.6895751953125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.34201694102605085, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11167500738734885, + "kl": 0.0158538818359375, + "learning_rate": 8.491465616371299e-07, + "loss": 0.0726, + "num_tokens": 941029508.0, + "reward": 1.4704241752624512, + "reward_std": 0.3550918400287628, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.17041130363941193, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 999.4464721679688, + "completions/mean_terminated_length": 798.6595458984375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.34223003569335675, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12552361769465137, + "kl": 0.01715087890625, + "learning_rate": 8.488963926450313e-07, + "loss": 0.0742, + "num_tokens": 941550700.0, + "reward": 1.5630581378936768, + "reward_std": 0.33897992968559265, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.18274828791618347, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1021.3817138671875, + "completions/mean_terminated_length": 821.5333251953125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.3424431303606627, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11984445150945572, + "kl": 0.017578125, + "learning_rate": 8.48646058236658e-07, + "loss": 0.0874, + "num_tokens": 942085079.0, + "reward": 1.5736607313156128, + "reward_std": 0.23639696836471558, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.15519075095653534, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 940.0178833007812, + "completions/mean_terminated_length": 758.711669921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.34265622502796866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1395942109562022, + "kl": 0.02020263671875, + "learning_rate": 8.483955585505507e-07, + "loss": 0.1327, + "num_tokens": 942573631.0, + "reward": 1.5725446939468384, + "reward_std": 0.3420998752117157, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.18730680644512177, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1209.430908203125, + "completions/mean_terminated_length": 933.2255249023438, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.3428693196952746, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11163256036739098, + "kl": 0.0169219970703125, + "learning_rate": 8.48144893725342e-07, + "loss": 0.0921, + "num_tokens": 943187008.0, + "reward": 1.450334906578064, + "reward_std": 0.356228232383728, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9168526530265808, + "rewards/tag_count_reward/std": 0.2339489907026291, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1088.6942138671875, + "completions/mean_terminated_length": 833.9632568359375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3430824143625806, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12462191398307557, + "kl": 0.0165863037109375, + "learning_rate": 8.478940638997558e-07, + "loss": 0.0651, + "num_tokens": 943746711.0, + "reward": 1.3945313692092896, + "reward_std": 0.2974564731121063, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.1695888340473175, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1024.5023193359375, + "completions/mean_terminated_length": 825.2613525390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.34329550902988654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14180287976384584, + "kl": 0.018768310546875, + "learning_rate": 8.47643069212607e-07, + "loss": 0.129, + "num_tokens": 944280568.0, + "reward": 1.4592634439468384, + "reward_std": 0.4094885587692261, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1929948478937149, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1156.529052734375, + "completions/mean_terminated_length": 913.4005737304688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3435086036971925, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11693392577947447, + "kl": 0.016357421875, + "learning_rate": 8.473919098028021e-07, + "loss": 0.0557, + "num_tokens": 944869189.0, + "reward": 1.3415179252624512, + "reward_std": 0.2694803476333618, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.17691786587238312, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1170.77685546875, + "completions/mean_terminated_length": 902.2390747070312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.34372169836449845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12202034254249197, + "kl": 0.0159454345703125, + "learning_rate": 8.471405858093385e-07, + "loss": 0.1094, + "num_tokens": 945461777.0, + "reward": 1.3777902126312256, + "reward_std": 0.3514552414417267, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9224330186843872, + "rewards/tag_count_reward/std": 0.21863438189029694, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 966.763427734375, + "completions/mean_terminated_length": 786.5573120117188, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.34393479303180435, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13679831943531562, + "kl": 0.01873779296875, + "learning_rate": 8.468890973713048e-07, + "loss": 0.0488, + "num_tokens": 945960535.0, + "reward": 1.469866156578064, + "reward_std": 0.3222181499004364, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.1764904409646988, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1059.203125, + "completions/mean_terminated_length": 860.3834228515625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.3441478876991103, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12369640650993691, + "kl": 0.0165557861328125, + "learning_rate": 8.466374446278806e-07, + "loss": 0.0944, + "num_tokens": 946501458.0, + "reward": 1.4720982313156128, + "reward_std": 0.29216450452804565, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.165329247713089, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1114.435302734375, + "completions/mean_terminated_length": 882.994384765625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.34436098236641627, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12614838333385603, + "kl": 0.018585205078125, + "learning_rate": 8.463856277183366e-07, + "loss": 0.0976, + "num_tokens": 947062917.0, + "reward": 1.4877232313156128, + "reward_std": 0.3302699625492096, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.20703594386577606, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1202.9598388671875, + "completions/mean_terminated_length": 900.7938842773438, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.3445740770337222, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10659987049189806, + "kl": 0.016021728515625, + "learning_rate": 8.461336467820339e-07, + "loss": 0.044, + "num_tokens": 947669523.0, + "reward": 1.3582589626312256, + "reward_std": 0.36030831933021545, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.20681875944137573, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1181.1295166015625, + "completions/mean_terminated_length": 915.760986328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3447871717010282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11785801678981878, + "kl": 0.016998291015625, + "learning_rate": 8.458815019584247e-07, + "loss": 0.0947, + "num_tokens": 948272365.0, + "reward": 1.3247768878936768, + "reward_std": 0.3536304831504822, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.2192423790693283, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1076.9285888671875, + "completions/mean_terminated_length": 865.8261108398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.34500026636833414, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.31550087920220626, + "kl": 0.027740478515625, + "learning_rate": 8.456291933870521e-07, + "loss": 0.088, + "num_tokens": 948830349.0, + "reward": 1.3984376192092896, + "reward_std": 0.31362342834472656, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.18730680644512177, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 974.622802734375, + "completions/mean_terminated_length": 818.145751953125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.3452133610356401, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12709461279539422, + "kl": 0.018341064453125, + "learning_rate": 8.453767212075495e-07, + "loss": 0.0471, + "num_tokens": 949327844.0, + "reward": 1.5719866752624512, + "reward_std": 0.3165784478187561, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.17984239757061005, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1078.185302734375, + "completions/mean_terminated_length": 841.1194458007812, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.34542645570294606, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11790167650458931, + "kl": 0.01812744140625, + "learning_rate": 8.451240855596409e-07, + "loss": 0.0298, + "num_tokens": 949876791.0, + "reward": 1.4603794813156128, + "reward_std": 0.32075875997543335, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.20976383984088898, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1006.0178833007812, + "completions/mean_terminated_length": 765.5604858398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.34563955037025196, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13012699424904645, + "kl": 0.01873779296875, + "learning_rate": 8.448712865831405e-07, + "loss": 0.0877, + "num_tokens": 950393359.0, + "reward": 1.4559152126312256, + "reward_std": 0.2959991991519928, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.19338902831077576, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 972.1116333007812, + "completions/mean_terminated_length": 812.1077270507812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3458526450375579, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11659833126967319, + "kl": 0.01934814453125, + "learning_rate": 8.446183244179537e-07, + "loss": 0.0722, + "num_tokens": 950897457.0, + "reward": 1.5580357313156128, + "reward_std": 0.3262375593185425, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.16415086388587952, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1201.3795166015625, + "completions/mean_terminated_length": 964.32568359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3460657397048639, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11849761053249776, + "kl": 0.0146636962890625, + "learning_rate": 8.443651992040754e-07, + "loss": 0.0797, + "num_tokens": 951507995.0, + "reward": 1.4095982313156128, + "reward_std": 0.33879438042640686, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.1686781942844391, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 994.7879638671875, + "completions/mean_terminated_length": 838.1564331054688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.34627883437216983, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11993875597034388, + "kl": 0.01995849609375, + "learning_rate": 8.441119110815911e-07, + "loss": 0.0465, + "num_tokens": 952027740.0, + "reward": 1.4531251192092896, + "reward_std": 0.3466882109642029, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.18240725994110107, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 964.8616333007812, + "completions/mean_terminated_length": 794.1343994140625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.3464919290394758, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14036370193297334, + "kl": 0.02081298828125, + "learning_rate": 8.438584601906763e-07, + "loss": 0.0972, + "num_tokens": 952533726.0, + "reward": 1.4905134439468384, + "reward_std": 0.27920711040496826, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.17873525619506836, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 987.6629638671875, + "completions/mean_terminated_length": 807.710205078125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.34670502370678175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16755385757276825, + "kl": 0.023956298828125, + "learning_rate": 8.436048466715968e-07, + "loss": 0.087, + "num_tokens": 953049479.0, + "reward": 1.5234376192092896, + "reward_std": 0.3229178786277771, + "rewards/accuracy_reward/mean": 0.6087962985038757, + "rewards/accuracy_reward/std": 0.4885856807231903, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19961901009082794, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 992.9688110351562, + "completions/mean_terminated_length": 817.1302490234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3469181183740877, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13232075696645468, + "kl": 0.021514892578125, + "learning_rate": 8.433510706647082e-07, + "loss": 0.0593, + "num_tokens": 953555241.0, + "reward": 1.438616156578064, + "reward_std": 0.32919150590896606, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.1796870082616806, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1112.4107666015625, + "completions/mean_terminated_length": 833.0899047851562, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.34713121304139366, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1288337679320564, + "kl": 0.0172119140625, + "learning_rate": 8.43097132310456e-07, + "loss": 0.1148, + "num_tokens": 954124049.0, + "reward": 1.4252232313156128, + "reward_std": 0.39024457335472107, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9073660969734192, + "rewards/tag_count_reward/std": 0.2436242550611496, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 940.5670166015625, + "completions/mean_terminated_length": 788.7868041992188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.34734430770869956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14651077284284583, + "kl": 0.02044677734375, + "learning_rate": 8.428430317493758e-07, + "loss": 0.1327, + "num_tokens": 954616527.0, + "reward": 1.4955357313156128, + "reward_std": 0.3442634642124176, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.18987849354743958, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1108.390625, + "completions/mean_terminated_length": 888.3718872070312, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.3475574023760055, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.114830226473819, + "kl": 0.0158538818359375, + "learning_rate": 8.425887691220927e-07, + "loss": 0.0743, + "num_tokens": 955185582.0, + "reward": 1.524553656578064, + "reward_std": 0.38950827717781067, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.17145662009716034, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 979.99560546875, + "completions/mean_terminated_length": 722.609375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3477704970433115, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12826343656744987, + "kl": 0.021820068359375, + "learning_rate": 8.423343445693217e-07, + "loss": 0.0996, + "num_tokens": 955694972.0, + "reward": 1.594866156578064, + "reward_std": 0.3238920569419861, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764313220977783, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1879189908504486, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1118.935302734375, + "completions/mean_terminated_length": 907.6685180664062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.34798359171061743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12611364433179229, + "kl": 0.017791748046875, + "learning_rate": 8.420797582318672e-07, + "loss": 0.0917, + "num_tokens": 956278367.0, + "reward": 1.4603794813156128, + "reward_std": 0.30211108922958374, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.22847945988178253, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1056.9241943359375, + "completions/mean_terminated_length": 828.2142944335938, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.3481966863779234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13416065233687174, + "kl": 0.018402099609375, + "learning_rate": 8.418250102506235e-07, + "loss": 0.0843, + "num_tokens": 956818829.0, + "reward": 1.4408482313156128, + "reward_std": 0.33602064847946167, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.18046346306800842, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1151.872802734375, + "completions/mean_terminated_length": 894.3649291992188, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.34840978104522935, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10417369791164842, + "kl": 0.015472412109375, + "learning_rate": 8.415701007665738e-07, + "loss": 0.0473, + "num_tokens": 957414372.0, + "reward": 1.5368304252624512, + "reward_std": 0.32307910919189453, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.18642495572566986, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 974.3906860351562, + "completions/mean_terminated_length": 817.8797607421875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3486228757125353, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1435312812324333, + "kl": 0.019500732421875, + "learning_rate": 8.41315029920791e-07, + "loss": 0.0753, + "num_tokens": 957914643.0, + "reward": 1.5033482313156128, + "reward_std": 0.324045866727829, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1649361401796341, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1107.341552734375, + "completions/mean_terminated_length": 893.4383544921875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.34883597037984126, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11185849895635559, + "kl": 0.01837158203125, + "learning_rate": 8.410597978544375e-07, + "loss": 0.1215, + "num_tokens": 958481404.0, + "reward": 1.4726563692092896, + "reward_std": 0.35467612743377686, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.21838873624801636, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1952.0, + "completions/mean_length": 1000.310302734375, + "completions/mean_terminated_length": 799.6887817382812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3490490650471472, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12394316537549191, + "kl": 0.01983642578125, + "learning_rate": 8.408044047087647e-07, + "loss": 0.0865, + "num_tokens": 958990087.0, + "reward": 1.5195313692092896, + "reward_std": 0.2774677574634552, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.16506759822368622, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1031.7991943359375, + "completions/mean_terminated_length": 807.5149536132812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3492621597144531, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13214700511177013, + "kl": 0.016876220703125, + "learning_rate": 8.405488506251131e-07, + "loss": 0.1135, + "num_tokens": 959519341.0, + "reward": 1.5000001192092896, + "reward_std": 0.34166568517684937, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.1928403526544571, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 956.6160888671875, + "completions/mean_terminated_length": 781.3160400390625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.3494752543817591, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1455037968751677, + "kl": 0.02197265625, + "learning_rate": 8.402931357449121e-07, + "loss": 0.0907, + "num_tokens": 960015841.0, + "reward": 1.4860491752624512, + "reward_std": 0.29433223605155945, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.162506565451622, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 996.9888916015625, + "completions/mean_terminated_length": 775.42431640625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.34968834904906504, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12017265066066854, + "kl": 0.016632080078125, + "learning_rate": 8.400372602096807e-07, + "loss": 0.067, + "num_tokens": 960524332.0, + "reward": 1.567522406578064, + "reward_std": 0.3173326849937439, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15405942499637604, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1063.5023193359375, + "completions/mean_terminated_length": 859.1724853515625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.349901443716371, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12225836227539241, + "kl": 0.018890380859375, + "learning_rate": 8.397812241610261e-07, + "loss": 0.0656, + "num_tokens": 961069213.0, + "reward": 1.4687501192092896, + "reward_std": 0.3443000912666321, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.19134558737277985, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1920.0, + "completions/mean_length": 1042.122802734375, + "completions/mean_terminated_length": 782.177001953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35011453838367695, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1278527087777152, + "kl": 0.01812744140625, + "learning_rate": 8.395250277406448e-07, + "loss": 0.1023, + "num_tokens": 961609684.0, + "reward": 1.4095982313156128, + "reward_std": 0.3001745641231537, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.17418356239795685, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 957.08935546875, + "completions/mean_terminated_length": 794.851318359375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.3503276330509829, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12985906862358204, + "kl": 0.018646240234375, + "learning_rate": 8.392686710903221e-07, + "loss": 0.0373, + "num_tokens": 962107820.0, + "reward": 1.5390626192092896, + "reward_std": 0.24022845923900604, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.14819122850894928, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1110.919677734375, + "completions/mean_terminated_length": 802.2670288085938, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.35054072771828887, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.48295840111489974, + "kl": 0.025054931640625, + "learning_rate": 8.390121543519313e-07, + "loss": 0.1008, + "num_tokens": 962675768.0, + "reward": 1.4681919813156128, + "reward_std": 0.31022509932518005, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.17285525798797607, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 822.3683471679688, + "completions/mean_terminated_length": 712.0316162109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3507538223855948, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1413350541777721, + "kl": 0.020111083984375, + "learning_rate": 8.387554776674352e-07, + "loss": 0.1297, + "num_tokens": 963114381.0, + "reward": 1.7181921005249023, + "reward_std": 0.3322814702987671, + "rewards/accuracy_reward/mean": 0.7544642686843872, + "rewards/accuracy_reward/std": 0.43088552355766296, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.1416827142238617, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 971.44873046875, + "completions/mean_terminated_length": 801.7597045898438, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.35096691705290073, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12906559349788407, + "kl": 0.0170745849609375, + "learning_rate": 8.384986411788846e-07, + "loss": 0.0665, + "num_tokens": 963621590.0, + "reward": 1.5256696939468384, + "reward_std": 0.32936400175094604, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.12144903838634491, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1183.0826416015625, + "completions/mean_terminated_length": 950.314453125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.3511800117202067, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11762234638480969, + "kl": 0.014984130859375, + "learning_rate": 8.382416450284186e-07, + "loss": 0.0504, + "num_tokens": 964220331.0, + "reward": 1.4051339626312256, + "reward_std": 0.3253902494907379, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.17050378024578094, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1008.7098388671875, + "completions/mean_terminated_length": 806.3946533203125, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.35139310638751264, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1324159762240552, + "kl": 0.01788330078125, + "learning_rate": 8.379844893582653e-07, + "loss": 0.1185, + "num_tokens": 964734841.0, + "reward": 1.4001116752624512, + "reward_std": 0.3016037940979004, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.17976601421833038, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1136.5982666015625, + "completions/mean_terminated_length": 861.0581665039062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.3516062010548186, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11354181730812309, + "kl": 0.0147705078125, + "learning_rate": 8.377271743107403e-07, + "loss": 0.0811, + "num_tokens": 965325669.0, + "reward": 1.4213169813156128, + "reward_std": 0.33992189168930054, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.18230371177196503, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 968.9620971679688, + "completions/mean_terminated_length": 792.3922119140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35181929572212456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2179572642136493, + "kl": 0.02227783203125, + "learning_rate": 8.374697000282481e-07, + "loss": 0.065, + "num_tokens": 965828292.0, + "reward": 1.4095982313156128, + "reward_std": 0.33641761541366577, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549686908722, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.20094045996665955, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1018.8638916015625, + "completions/mean_terminated_length": 808.6102294921875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3520323903894305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12762655299513068, + "kl": 0.0177001953125, + "learning_rate": 8.372120666532808e-07, + "loss": 0.0698, + "num_tokens": 966350871.0, + "reward": 1.5468751192092896, + "reward_std": 0.3452845513820648, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.14789186418056488, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 967.5357666015625, + "completions/mean_terminated_length": 816.32568359375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.3522454850567365, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.125028850924469, + "kl": 0.0203857421875, + "learning_rate": 8.36954274328419e-07, + "loss": 0.0189, + "num_tokens": 966852951.0, + "reward": 1.4949777126312256, + "reward_std": 0.33132684230804443, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.17236346006393433, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1154.560302734375, + "completions/mean_terminated_length": 874.2140502929688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.35245857972404243, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12200378824755592, + "kl": 0.01568603515625, + "learning_rate": 8.366963231963306e-07, + "loss": 0.0864, + "num_tokens": 967441506.0, + "reward": 1.3470982313156128, + "reward_std": 0.2944601774215698, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.19318638741970062, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 893.2031860351562, + "completions/mean_terminated_length": 711.180908203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35267167439134833, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1441753126497566, + "kl": 0.02227783203125, + "learning_rate": 8.364382133997722e-07, + "loss": 0.1076, + "num_tokens": 967906077.0, + "reward": 1.508928656578064, + "reward_std": 0.3505012094974518, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.20760497450828552, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1043.3929443359375, + "completions/mean_terminated_length": 847.829345703125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3528847690586543, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12115994017717166, + "kl": 0.019378662109375, + "learning_rate": 8.361799450815875e-07, + "loss": 0.0811, + "num_tokens": 968446749.0, + "reward": 1.5680804252624512, + "reward_std": 0.31966859102249146, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.18280036747455597, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1042.484375, + "completions/mean_terminated_length": 803.60498046875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.35309786372596025, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13321623980124786, + "kl": 0.01904296875, + "learning_rate": 8.359215183847086e-07, + "loss": 0.0663, + "num_tokens": 968978134.0, + "reward": 1.5005581378936768, + "reward_std": 0.2845189571380615, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.1692424863576889, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1129.29248046875, + "completions/mean_terminated_length": 861.8875732421875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.3533109583932662, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13073354791989578, + "kl": 0.016571044921875, + "learning_rate": 8.356629334521545e-07, + "loss": 0.0521, + "num_tokens": 969561689.0, + "reward": 1.3052456378936768, + "reward_std": 0.3249323070049286, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.22644878923892975, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 987.4464721679688, + "completions/mean_terminated_length": 807.4569091796875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.35352405306057216, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13731001616266328, + "kl": 0.0172882080078125, + "learning_rate": 8.354041904270324e-07, + "loss": 0.1203, + "num_tokens": 970076385.0, + "reward": 1.5106027126312256, + "reward_std": 0.3532072603702545, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.2085641622543335, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1098.640625, + "completions/mean_terminated_length": 829.338134765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3537371477278781, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1409123868168773, + "kl": 0.0172119140625, + "learning_rate": 8.351452894525368e-07, + "loss": 0.1014, + "num_tokens": 970635440.0, + "reward": 1.3867188692092896, + "reward_std": 0.3475736975669861, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9246651530265808, + "rewards/tag_count_reward/std": 0.22131875157356262, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 971.0938110351562, + "completions/mean_terminated_length": 785.0314331054688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3539502423951841, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13482737552091728, + "kl": 0.018646240234375, + "learning_rate": 8.348862306719495e-07, + "loss": 0.1056, + "num_tokens": 971141210.0, + "reward": 1.5055804252624512, + "reward_std": 0.3174375295639038, + "rewards/accuracy_reward/mean": 0.5902777910232544, + "rewards/accuracy_reward/std": 0.49235257506370544, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.17736537754535675, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1097.1451416015625, + "completions/mean_terminated_length": 871.2514038085938, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.35416333706249004, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12153666244557827, + "kl": 0.01702880859375, + "learning_rate": 8.346270142286397e-07, + "loss": 0.088, + "num_tokens": 971703307.0, + "reward": 1.3431919813156128, + "reward_std": 0.3357907831668854, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.16183683276176453, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1014.1004638671875, + "completions/mean_terminated_length": 772.0027465820312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35437643172979594, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12967449414124, + "kl": 0.019866943359375, + "learning_rate": 8.343676402660638e-07, + "loss": 0.081, + "num_tokens": 972224392.0, + "reward": 1.536272406578064, + "reward_std": 0.3587360978126526, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.19454751908779144, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1022.0625610351562, + "completions/mean_terminated_length": 884.4050903320312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3545895263971019, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11471894006231183, + "kl": 0.0185546875, + "learning_rate": 8.341081089277655e-07, + "loss": 0.049, + "num_tokens": 972753876.0, + "reward": 1.4285714626312256, + "reward_std": 0.36653241515159607, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.18907469511032104, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1111.622802734375, + "completions/mean_terminated_length": 882.7305908203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35480262106440785, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13530110386335292, + "kl": 0.01776123046875, + "learning_rate": 8.338484203573756e-07, + "loss": 0.0968, + "num_tokens": 973325467.0, + "reward": 1.3510044813156128, + "reward_std": 0.3596004843711853, + "rewards/accuracy_reward/mean": 0.49038460850715637, + "rewards/accuracy_reward/std": 0.50050950050354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8956473469734192, + "rewards/tag_count_reward/std": 0.25165674090385437, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1180.540283203125, + "completions/mean_terminated_length": 887.934326171875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3550157157317138, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1116113699141581, + "kl": 0.0158538818359375, + "learning_rate": 8.335885746986118e-07, + "loss": 0.0546, + "num_tokens": 973926493.0, + "reward": 1.208147406578064, + "reward_std": 0.32724109292030334, + "rewards/accuracy_reward/mean": 0.2901785671710968, + "rewards/accuracy_reward/std": 0.4543520212173462, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9157366156578064, + "rewards/tag_count_reward/std": 0.22438858449459076, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1017.5714721679688, + "completions/mean_terminated_length": 796.9647827148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35522881039901977, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13657784015413618, + "kl": 0.01800537109375, + "learning_rate": 8.333285720952787e-07, + "loss": 0.0385, + "num_tokens": 974448253.0, + "reward": 1.4068081378936768, + "reward_std": 0.3253634572029114, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.17820364236831665, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1078.9732666015625, + "completions/mean_terminated_length": 848.762451171875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.3554419050663257, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11625420033079578, + "kl": 0.01666259765625, + "learning_rate": 8.330684126912679e-07, + "loss": 0.0719, + "num_tokens": 974993665.0, + "reward": 1.5848214626312256, + "reward_std": 0.3165508806705475, + "rewards/accuracy_reward/mean": 0.6629464030265808, + "rewards/accuracy_reward/std": 0.47323182225227356, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.20557457208633423, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1159.466552734375, + "completions/mean_terminated_length": 887.4664916992188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3556549997336317, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1171302782261919, + "kl": 0.0158233642578125, + "learning_rate": 8.328080966305577e-07, + "loss": 0.0569, + "num_tokens": 975587218.0, + "reward": 1.4012277126312256, + "reward_std": 0.3639538884162903, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.18292580544948578, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1000.7567138671875, + "completions/mean_terminated_length": 779.9865112304688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.35586809440093764, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13116914093618792, + "kl": 0.018829345703125, + "learning_rate": 8.325476240572131e-07, + "loss": 0.0852, + "num_tokens": 976104261.0, + "reward": 1.4754464626312256, + "reward_std": 0.3803035318851471, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.22219766676425934, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1208.125, + "completions/mean_terminated_length": 904.3404541015625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.35608118906824354, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12044837001720093, + "kl": 0.0179901123046875, + "learning_rate": 8.322869951153859e-07, + "loss": 0.0858, + "num_tokens": 976719853.0, + "reward": 1.3537946939468384, + "reward_std": 0.31160521507263184, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.22028762102127075, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 948.872802734375, + "completions/mean_terminated_length": 772.3289794921875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3562942837355495, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.3304440682812741, + "kl": 0.0545654296875, + "learning_rate": 8.32026209949314e-07, + "loss": 0.0757, + "num_tokens": 977210500.0, + "reward": 1.5669643878936768, + "reward_std": 0.3016761243343353, + "rewards/accuracy_reward/mean": 0.6759259104728699, + "rewards/accuracy_reward/std": 0.4685704708099365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9151785969734192, + "rewards/tag_count_reward/std": 0.2324453443288803, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1168.4554443359375, + "completions/mean_terminated_length": 835.58154296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35650737840285546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14941878690052934, + "kl": 0.01751708984375, + "learning_rate": 8.317652687033223e-07, + "loss": 0.0628, + "num_tokens": 977807296.0, + "reward": 1.3297991752624512, + "reward_std": 0.42319467663764954, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9168526530265808, + "rewards/tag_count_reward/std": 0.22850678861141205, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1031.2835693359375, + "completions/mean_terminated_length": 823.5671997070312, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3567204730701614, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12458043412497996, + "kl": 0.01824951171875, + "learning_rate": 8.315041715218216e-07, + "loss": 0.084, + "num_tokens": 978340655.0, + "reward": 1.5491071939468384, + "reward_std": 0.3681546747684479, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.19925907254219055, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1069.8795166015625, + "completions/mean_terminated_length": 876.3475952148438, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.35693356773746737, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12201868729627982, + "kl": 0.01983642578125, + "learning_rate": 8.312429185493091e-07, + "loss": 0.0659, + "num_tokens": 978886969.0, + "reward": 1.450334906578064, + "reward_std": 0.3468681573867798, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.18704918026924133, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 988.46435546875, + "completions/mean_terminated_length": 821.4573974609375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.35714666240477333, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13102302072404803, + "kl": 0.01837158203125, + "learning_rate": 8.309815099303687e-07, + "loss": 0.0715, + "num_tokens": 979397161.0, + "reward": 1.5262277126312256, + "reward_std": 0.26391923427581787, + "rewards/accuracy_reward/mean": 0.6105769276618958, + "rewards/accuracy_reward/std": 0.4882066547870636, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.15467405319213867, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1124.634033203125, + "completions/mean_terminated_length": 895.721435546875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.3573597570720793, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11683748049961222, + "kl": 0.01702880859375, + "learning_rate": 8.307199458096699e-07, + "loss": 0.0589, + "num_tokens": 979972085.0, + "reward": 1.4654018878936768, + "reward_std": 0.37644821405410767, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.207614004611969, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1164.34375, + "completions/mean_terminated_length": 916.9199829101562, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.35757285173938524, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12329211528085454, + "kl": 0.017242431640625, + "learning_rate": 8.304582263319683e-07, + "loss": 0.0367, + "num_tokens": 980570479.0, + "reward": 1.3392857313156128, + "reward_std": 0.3130727708339691, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.19925905764102936, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1121.1116943359375, + "completions/mean_terminated_length": 884.845947265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35778594640669115, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12438145003668408, + "kl": 0.0177001953125, + "learning_rate": 8.301963516421055e-07, + "loss": 0.0615, + "num_tokens": 981139537.0, + "reward": 1.5239956378936768, + "reward_std": 0.25324687361717224, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.1899302750825882, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1063.4732666015625, + "completions/mean_terminated_length": 836.2747192382812, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3579990410739971, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13310002736889023, + "kl": 0.018310546875, + "learning_rate": 8.299343218850094e-07, + "loss": 0.0865, + "num_tokens": 981680629.0, + "reward": 1.5145089626312256, + "reward_std": 0.34273067116737366, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.2021544873714447, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1086.01123046875, + "completions/mean_terminated_length": 864.0137329101562, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.35821213574130306, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11457330950328799, + "kl": 0.018951416015625, + "learning_rate": 8.296721372056933e-07, + "loss": 0.0597, + "num_tokens": 982238202.0, + "reward": 1.4944196939468384, + "reward_std": 0.3844994902610779, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.20643207430839539, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1095.4129638671875, + "completions/mean_terminated_length": 900.7984008789062, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.358425230408609, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11919204974769292, + "kl": 0.018768310546875, + "learning_rate": 8.294097977492564e-07, + "loss": 0.0808, + "num_tokens": 982792051.0, + "reward": 1.5424107313156128, + "reward_std": 0.37556812167167664, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.19573186337947845, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1067.4910888671875, + "completions/mean_terminated_length": 888.9815673828125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.358638325075915, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12468285425006967, + "kl": 0.01904296875, + "learning_rate": 8.291473036608834e-07, + "loss": 0.1023, + "num_tokens": 983338127.0, + "reward": 1.5446429252624512, + "reward_std": 0.33349454402923584, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.19357694685459137, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1051.602783203125, + "completions/mean_terminated_length": 818.2864990234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35885141974322093, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12461767786802058, + "kl": 0.019989013671875, + "learning_rate": 8.288846550858446e-07, + "loss": 0.0632, + "num_tokens": 983873693.0, + "reward": 1.583147406578064, + "reward_std": 0.3163088858127594, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16086193919181824, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1087.5513916015625, + "completions/mean_terminated_length": 872.3688354492188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3590645144105269, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12588477808091145, + "kl": 0.01849365234375, + "learning_rate": 8.286218521694961e-07, + "loss": 0.1261, + "num_tokens": 984434724.0, + "reward": 1.4514509439468384, + "reward_std": 0.3659247159957886, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.20658548176288605, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1075.4554443359375, + "completions/mean_terminated_length": 803.142822265625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.35927760907783285, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13375983502899372, + "kl": 0.01934814453125, + "learning_rate": 8.283588950572791e-07, + "loss": 0.1389, + "num_tokens": 984987904.0, + "reward": 1.4335938692092896, + "reward_std": 0.34458714723587036, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.24595172703266144, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 984.7522583007812, + "completions/mean_terminated_length": 767.529541015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35949070374513875, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11898711831861983, + "kl": 0.01959228515625, + "learning_rate": 8.280957838947204e-07, + "loss": 0.0767, + "num_tokens": 985495457.0, + "reward": 1.5329241752624512, + "reward_std": 0.28722572326660156, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.1839466691017151, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1081.9754638671875, + "completions/mean_terminated_length": 855.7713623046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3597037984124447, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.126314108261914, + "kl": 0.017425537109375, + "learning_rate": 8.278325188274316e-07, + "loss": 0.0915, + "num_tokens": 986048118.0, + "reward": 1.4280134439468384, + "reward_std": 0.3145117163658142, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.18318496644496918, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1067.1629638671875, + "completions/mean_terminated_length": 866.77685546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.35991689307975067, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13189044116987939, + "kl": 0.020355224609375, + "learning_rate": 8.275691000011098e-07, + "loss": 0.0928, + "num_tokens": 986594319.0, + "reward": 1.559709906578064, + "reward_std": 0.2899355888366699, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.15902772545814514, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1011.482177734375, + "completions/mean_terminated_length": 789.5718383789062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3601299877470566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18035025318004214, + "kl": 0.022674560546875, + "learning_rate": 8.273055275615374e-07, + "loss": 0.1362, + "num_tokens": 987115751.0, + "reward": 1.4871652126312256, + "reward_std": 0.3944976031780243, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.20367643237113953, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1078.352783203125, + "completions/mean_terminated_length": 892.6754760742188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.3603430824143626, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1190019355143982, + "kl": 0.017578125, + "learning_rate": 8.270418016545812e-07, + "loss": 0.102, + "num_tokens": 987662101.0, + "reward": 1.5463169813156128, + "reward_std": 0.362278014421463, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.2172366976737976, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1040.546875, + "completions/mean_terminated_length": 828.1648559570312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.36055617708166854, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1212077769971808, + "kl": 0.017608642578125, + "learning_rate": 8.267779224261939e-07, + "loss": 0.0307, + "num_tokens": 988201546.0, + "reward": 1.5541294813156128, + "reward_std": 0.2592855393886566, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1395251452922821, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1076.669677734375, + "completions/mean_terminated_length": 829.0756225585938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3607692717489745, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1239379273978354, + "kl": 0.01617431640625, + "learning_rate": 8.265138900224117e-07, + "loss": 0.0371, + "num_tokens": 988753222.0, + "reward": 1.4614956378936768, + "reward_std": 0.3413955867290497, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.1752651333808899, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1032.5357666015625, + "completions/mean_terminated_length": 828.3539428710938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.36098236641628045, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13141921902148235, + "kl": 0.018280029296875, + "learning_rate": 8.262497045893569e-07, + "loss": 0.0771, + "num_tokens": 989296230.0, + "reward": 1.5072544813156128, + "reward_std": 0.3216952085494995, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.1573467254638672, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1176.7054443359375, + "completions/mean_terminated_length": 879.3173828125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.36119546108358636, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11074433099547726, + "kl": 0.016693115234375, + "learning_rate": 8.259853662732358e-07, + "loss": 0.0801, + "num_tokens": 989893906.0, + "reward": 1.3850446939468384, + "reward_std": 0.3302210867404938, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.18940123915672302, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1162.078125, + "completions/mean_terminated_length": 877.2241821289062, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.3614085557508923, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11806739774367307, + "kl": 0.01617431640625, + "learning_rate": 8.257208752203392e-07, + "loss": 0.0752, + "num_tokens": 990479589.0, + "reward": 1.4330357313156128, + "reward_std": 0.28618568181991577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14133092761039734, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1082.4263916015625, + "completions/mean_terminated_length": 836.2997436523438, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.36162165041819827, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1273603523245905, + "kl": 0.017120361328125, + "learning_rate": 8.254562315770428e-07, + "loss": 0.0899, + "num_tokens": 991027796.0, + "reward": 1.5083706378936768, + "reward_std": 0.35290732979774475, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.18088065087795258, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1055.5804443359375, + "completions/mean_terminated_length": 849.6064453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.36183474508550423, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1321313240164551, + "kl": 0.0201416015625, + "learning_rate": 8.251914354898067e-07, + "loss": 0.1121, + "num_tokens": 991567784.0, + "reward": 1.4704241752624512, + "reward_std": 0.3289593458175659, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.18413659930229187, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1060.546875, + "completions/mean_terminated_length": 832.673095703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3620478397528102, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13345264592004155, + "kl": 0.0173492431640625, + "learning_rate": 8.249264871051751e-07, + "loss": 0.0804, + "num_tokens": 992116365.0, + "reward": 1.4129464626312256, + "reward_std": 0.36945098638534546, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.16981780529022217, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 977.51123046875, + "completions/mean_terminated_length": 827.6972045898438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.36226093442011614, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 9.959927533064949, + "kl": 0.44854736328125, + "learning_rate": 8.246613865697767e-07, + "loss": 0.0959, + "num_tokens": 992620642.0, + "reward": 1.594866156578064, + "reward_std": 0.2843775153160095, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.4810029864311218, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.17070865631103516, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1151.84375, + "completions/mean_terminated_length": 853.125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.3624740290874221, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11214373283501447, + "kl": 0.0160675048828125, + "learning_rate": 8.243961340303245e-07, + "loss": 0.0532, + "num_tokens": 993214620.0, + "reward": 1.3989956378936768, + "reward_std": 0.3131371736526489, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9213169813156128, + "rewards/tag_count_reward/std": 0.2294771820306778, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1057.7076416015625, + "completions/mean_terminated_length": 864.9306640625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.36268712375472806, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12338856575292699, + "kl": 0.0188140869140625, + "learning_rate": 8.241307296336151e-07, + "loss": 0.1174, + "num_tokens": 993761689.0, + "reward": 1.5373884439468384, + "reward_std": 0.38775938749313354, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.17854657769203186, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1098.8348388671875, + "completions/mean_terminated_length": 853.5449829101562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.362900218422034, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1312230965107301, + "kl": 0.017120361328125, + "learning_rate": 8.238651735265298e-07, + "loss": 0.092, + "num_tokens": 994324191.0, + "reward": 1.4235491752624512, + "reward_std": 0.3866247534751892, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.19661563634872437, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 971.4844360351562, + "completions/mean_terminated_length": 792.0651245117188, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.3631133130893399, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1186374845999787, + "kl": 0.02142333984375, + "learning_rate": 8.235994658560338e-07, + "loss": 0.0522, + "num_tokens": 994832360.0, + "reward": 1.5145089626312256, + "reward_std": 0.2828209102153778, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.18046344816684723, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 912.2188110351562, + "completions/mean_terminated_length": 749.9642944335938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3633264077566459, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11737759474799048, + "kl": 0.021728515625, + "learning_rate": 8.233336067691755e-07, + "loss": 0.0756, + "num_tokens": 995308314.0, + "reward": 1.6032366752624512, + "reward_std": 0.32066306471824646, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1649162620306015, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1143.41748046875, + "completions/mean_terminated_length": 912.8375854492188, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.36353950242395183, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11893022266779026, + "kl": 0.0166778564453125, + "learning_rate": 8.230675964130879e-07, + "loss": 0.0462, + "num_tokens": 995884501.0, + "reward": 1.4709821939468384, + "reward_std": 0.3447098135948181, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.16584569215774536, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1055.575927734375, + "completions/mean_terminated_length": 881.0551147460938, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3637525970912578, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11985776270875176, + "kl": 0.017333984375, + "learning_rate": 8.228014349349872e-07, + "loss": 0.1165, + "num_tokens": 996430519.0, + "reward": 1.5770089626312256, + "reward_std": 0.36568358540534973, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1748131364583969, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1238.6295166015625, + "completions/mean_terminated_length": 942.5182495117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.36396569175856375, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11076538293855635, + "kl": 0.0143890380859375, + "learning_rate": 8.225351224821733e-07, + "loss": 0.0518, + "num_tokens": 997061777.0, + "reward": 1.4123884439468384, + "reward_std": 0.3476875424385071, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.20359058678150177, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1001.6339721679688, + "completions/mean_terminated_length": 763.6931762695312, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.3641787864258697, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1212053662061018, + "kl": 0.019683837890625, + "learning_rate": 8.222686592020298e-07, + "loss": 0.0839, + "num_tokens": 997577533.0, + "reward": 1.4827009439468384, + "reward_std": 0.3263048231601715, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.1681026816368103, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1066.13623046875, + "completions/mean_terminated_length": 832.875732421875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.36439188109317566, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11763812531412161, + "kl": 0.01885986328125, + "learning_rate": 8.220020452420241e-07, + "loss": 0.0693, + "num_tokens": 998121146.0, + "reward": 1.4603794813156128, + "reward_std": 0.31841525435447693, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.17382751405239105, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1117.091552734375, + "completions/mean_terminated_length": 879.8011474609375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3646049757604816, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12729011810206955, + "kl": 0.0177001953125, + "learning_rate": 8.217352807497062e-07, + "loss": 0.0991, + "num_tokens": 998689651.0, + "reward": 1.5524554252624512, + "reward_std": 0.3326716125011444, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.18032504618167877, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1031.5535888671875, + "completions/mean_terminated_length": 783.0889282226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3648180704277875, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.2462535373934271, + "kl": 0.0220947265625, + "learning_rate": 8.2146836587271e-07, + "loss": 0.0574, + "num_tokens": 999220683.0, + "reward": 1.5189732313156128, + "reward_std": 0.28955236077308655, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.1880783587694168, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1064.977783203125, + "completions/mean_terminated_length": 817.84912109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3650311650950935, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11847887395497089, + "kl": 0.0179290771484375, + "learning_rate": 8.212013007587524e-07, + "loss": 0.0395, + "num_tokens": 999766369.0, + "reward": 1.5189732313156128, + "reward_std": 0.37035804986953735, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.16426870226860046, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1127.96435546875, + "completions/mean_terminated_length": 918.750732421875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.36524425976239944, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11784266280503311, + "kl": 0.01812744140625, + "learning_rate": 8.209340855556336e-07, + "loss": 0.0813, + "num_tokens": 1000335969.0, + "reward": 1.3900669813156128, + "reward_std": 0.33505529165267944, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.20008359849452972, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1093.328125, + "completions/mean_terminated_length": 839.8276977539062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3654573544297054, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11780569090551464, + "kl": 0.017791748046875, + "learning_rate": 8.206667204112366e-07, + "loss": 0.0652, + "num_tokens": 1000895284.0, + "reward": 1.4118304252624512, + "reward_std": 0.3445415794849396, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.18040810525417328, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1115.279052734375, + "completions/mean_terminated_length": 900.0357666015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.36567044909701135, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11439541125913491, + "kl": 0.01837158203125, + "learning_rate": 8.203992054735276e-07, + "loss": 0.0753, + "num_tokens": 1001465809.0, + "reward": 1.4475446939468384, + "reward_std": 0.36878278851509094, + "rewards/accuracy_reward/mean": 0.5532407164573669, + "rewards/accuracy_reward/std": 0.4977337718009949, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.2332361787557602, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1121.8304443359375, + "completions/mean_terminated_length": 824.035400390625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.3658835437643173, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1198601269318102, + "kl": 0.0179290771484375, + "learning_rate": 8.201315408905557e-07, + "loss": 0.1124, + "num_tokens": 1002040053.0, + "reward": 1.4430804252624512, + "reward_std": 0.32946091890335083, + "rewards/accuracy_reward/mean": 0.5578703880310059, + "rewards/accuracy_reward/std": 0.49721553921699524, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9051339030265808, + "rewards/tag_count_reward/std": 0.23752164840698242, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1089.118408203125, + "completions/mean_terminated_length": 831.0623168945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.36609663843162327, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13048665007835744, + "kl": 0.019439697265625, + "learning_rate": 8.198637268104528e-07, + "loss": 0.0799, + "num_tokens": 1002594218.0, + "reward": 1.5345982313156128, + "reward_std": 0.36409419775009155, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.22654591500759125, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1164.3616943359375, + "completions/mean_terminated_length": 920.1652221679688, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3663097330989292, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.125473116194484, + "kl": 0.0157012939453125, + "learning_rate": 8.195957633814333e-07, + "loss": 0.0494, + "num_tokens": 1003192492.0, + "reward": 1.3152902126312256, + "reward_std": 0.351897656917572, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.2159051150083542, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1134.8170166015625, + "completions/mean_terminated_length": 933.2697143554688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3665228277662351, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11461254218935943, + "kl": 0.020355224609375, + "learning_rate": 8.193276507517946e-07, + "loss": 0.0582, + "num_tokens": 1003765658.0, + "reward": 1.4849331378936768, + "reward_std": 0.33629924058914185, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.18820028007030487, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1082.915283203125, + "completions/mean_terminated_length": 840.2960815429688, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.3667359224335411, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12126027271747143, + "kl": 0.018096923828125, + "learning_rate": 8.190593890699165e-07, + "loss": 0.0625, + "num_tokens": 1004315316.0, + "reward": 1.4720982313156128, + "reward_std": 0.32021471858024597, + "rewards/accuracy_reward/mean": 0.5370370149612427, + "rewards/accuracy_reward/std": 0.49920448660850525, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.1661728024482727, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1124.5692138671875, + "completions/mean_terminated_length": 920.7601928710938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.36694901710084704, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11582499865810182, + "kl": 0.01727294921875, + "learning_rate": 8.187909784842612e-07, + "loss": 0.0508, + "num_tokens": 1004891891.0, + "reward": 1.3515626192092896, + "reward_std": 0.28647276759147644, + "rewards/accuracy_reward/mean": 0.41203704476356506, + "rewards/accuracy_reward/std": 0.4927724003791809, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1625574380159378, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1146.5692138671875, + "completions/mean_terminated_length": 846.09228515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.367162111768153, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12354799408436021, + "kl": 0.0159149169921875, + "learning_rate": 8.185224191433738e-07, + "loss": 0.0794, + "num_tokens": 1005473714.0, + "reward": 1.372209906578064, + "reward_std": 0.33468201756477356, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.17627368867397308, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1107.3504638671875, + "completions/mean_terminated_length": 843.9685668945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.36737520643545896, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12316917774905219, + "kl": 0.018524169921875, + "learning_rate": 8.182537111958807e-07, + "loss": 0.0876, + "num_tokens": 1006036127.0, + "reward": 1.4196429252624512, + "reward_std": 0.36146289110183716, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.23967991769313812, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1018.7857666015625, + "completions/mean_terminated_length": 798.4390258789062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.3675883011027649, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13143077450399732, + "kl": 0.0181884765625, + "learning_rate": 8.179848547904916e-07, + "loss": 0.1183, + "num_tokens": 1006558303.0, + "reward": 1.3850446939468384, + "reward_std": 0.3376026749610901, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.16994640231132507, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 987.27685546875, + "completions/mean_terminated_length": 810.4896240234375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.36780139577007087, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1332942382157942, + "kl": 0.020782470703125, + "learning_rate": 8.177158500759979e-07, + "loss": 0.0697, + "num_tokens": 1007066363.0, + "reward": 1.5535714626312256, + "reward_std": 0.33747413754463196, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.1639530062675476, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1073.477783203125, + "completions/mean_terminated_length": 841.9613647460938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.36801449043737683, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1217144579086224, + "kl": 0.0186767578125, + "learning_rate": 8.174466972012731e-07, + "loss": 0.0807, + "num_tokens": 1007620433.0, + "reward": 1.4988839626312256, + "reward_std": 0.30491694808006287, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.16578169167041779, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1027.8348388671875, + "completions/mean_terminated_length": 809.4254760742188, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.36822758510468273, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12255101277995828, + "kl": 0.020050048828125, + "learning_rate": 8.171773963152728e-07, + "loss": 0.1073, + "num_tokens": 1008144727.0, + "reward": 1.5223214626312256, + "reward_std": 0.3487630784511566, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.2026386857032776, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1037.109375, + "completions/mean_terminated_length": 859.3411865234375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.3684406797719887, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13037602065967266, + "kl": 0.01898193359375, + "learning_rate": 8.169079475670342e-07, + "loss": 0.0905, + "num_tokens": 1008679160.0, + "reward": 1.469866156578064, + "reward_std": 0.37427228689193726, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.21069389581680298, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 918.7879638671875, + "completions/mean_terminated_length": 801.972900390625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.36865377443929465, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13342643249824107, + "kl": 0.020782470703125, + "learning_rate": 8.166383511056767e-07, + "loss": 0.1379, + "num_tokens": 1009157369.0, + "reward": 1.661272406578064, + "reward_std": 0.34895122051239014, + "rewards/accuracy_reward/mean": 0.7232142686843872, + "rewards/accuracy_reward/std": 0.44790980219841003, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19121423363685608, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1138.040283203125, + "completions/mean_terminated_length": 879.9140625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.3688668691066006, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11566171433185084, + "kl": 0.01776123046875, + "learning_rate": 8.163686070804013e-07, + "loss": 0.0872, + "num_tokens": 1009742107.0, + "reward": 1.5212054252624512, + "reward_std": 0.31370410323143005, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.2049998939037323, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1903.0, + "completions/mean_length": 982.1004638671875, + "completions/mean_terminated_length": 771.2005615234375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.36907996377390656, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13021953861724425, + "kl": 0.019744873046875, + "learning_rate": 8.160987156404907e-07, + "loss": 0.0556, + "num_tokens": 1010255512.0, + "reward": 1.5351563692092896, + "reward_std": 0.28683602809906006, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.17025740444660187, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1093.0648193359375, + "completions/mean_terminated_length": 839.4943237304688, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.3692930584412125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12509578018928036, + "kl": 0.01806640625, + "learning_rate": 8.158286769353091e-07, + "loss": 0.0747, + "num_tokens": 1010808821.0, + "reward": 1.411272406578064, + "reward_std": 0.3108561635017395, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.1990012526512146, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1122.375, + "completions/mean_terminated_length": 852.9567260742188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3695061531085185, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.54854668617899, + "kl": 0.026611328125, + "learning_rate": 8.155584911143022e-07, + "loss": 0.0888, + "num_tokens": 1011383677.0, + "reward": 1.3950893878936768, + "reward_std": 0.348760724067688, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.21133582293987274, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 980.3058471679688, + "completions/mean_terminated_length": 772.4613037109375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.36971924777582443, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14154967962596304, + "kl": 0.021697998046875, + "learning_rate": 8.152881583269973e-07, + "loss": 0.0817, + "num_tokens": 1011891190.0, + "reward": 1.5446429252624512, + "reward_std": 0.34302085638046265, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.18769630789756775, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 979.9308471679688, + "completions/mean_terminated_length": 811.578857421875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.36993234244313034, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11742019174956134, + "kl": 0.01983642578125, + "learning_rate": 8.15017678723003e-07, + "loss": 0.0509, + "num_tokens": 1012394775.0, + "reward": 1.5122768878936768, + "reward_std": 0.27459022402763367, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.13944123685359955, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1066.28125, + "completions/mean_terminated_length": 794.9800415039062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3701454371104363, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12609670433142364, + "kl": 0.01702880859375, + "learning_rate": 8.147470524520086e-07, + "loss": 0.0533, + "num_tokens": 1012935653.0, + "reward": 1.5574777126312256, + "reward_std": 0.2559746205806732, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.16314294934272766, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1154.747802734375, + "completions/mean_terminated_length": 891.4190673828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37035853177774225, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1144801754671953, + "kl": 0.014862060546875, + "learning_rate": 8.144762796637854e-07, + "loss": 0.0837, + "num_tokens": 1013520980.0, + "reward": 1.450334906578064, + "reward_std": 0.3751393258571625, + "rewards/accuracy_reward/mean": 0.5439814925193787, + "rewards/accuracy_reward/std": 0.4986393451690674, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.2093706578016281, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1042.341552734375, + "completions/mean_terminated_length": 813.6575317382812, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.3705716264450482, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1241329890380246, + "kl": 0.018646240234375, + "learning_rate": 8.142053605081854e-07, + "loss": 0.1473, + "num_tokens": 1014060285.0, + "reward": 1.5234376192092896, + "reward_std": 0.38116154074668884, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.21293358504772186, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1047.7254638671875, + "completions/mean_terminated_length": 771.2962646484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37078472111235417, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12626987222787797, + "kl": 0.017822265625, + "learning_rate": 8.139342951351415e-07, + "loss": 0.0839, + "num_tokens": 1014597410.0, + "reward": 1.4787946939468384, + "reward_std": 0.31804928183555603, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.20568081736564636, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1035.421875, + "completions/mean_terminated_length": 881.8432006835938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3709978157796601, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12744372520311079, + "kl": 0.020355224609375, + "learning_rate": 8.136630836946678e-07, + "loss": 0.0897, + "num_tokens": 1015127711.0, + "reward": 1.4838169813156128, + "reward_std": 0.3630126714706421, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.18937736749649048, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1036.828125, + "completions/mean_terminated_length": 810.2813720703125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.3712109104469661, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13076871779363503, + "kl": 0.018035888671875, + "learning_rate": 8.133917263368589e-07, + "loss": 0.1002, + "num_tokens": 1015654162.0, + "reward": 1.5641741752624512, + "reward_std": 0.325775146484375, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.1878882646560669, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 937.57373046875, + "completions/mean_terminated_length": 785.3832397460938, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.37142400511427204, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1272379782179284, + "kl": 0.021240234375, + "learning_rate": 8.131202232118904e-07, + "loss": 0.0782, + "num_tokens": 1016138051.0, + "reward": 1.555803656578064, + "reward_std": 0.3745437562465668, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.19418221712112427, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1017.1808471679688, + "completions/mean_terminated_length": 799.8729858398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37163709978157794, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1263719184175237, + "kl": 0.01885986328125, + "learning_rate": 8.128485744700185e-07, + "loss": 0.0727, + "num_tokens": 1016659316.0, + "reward": 1.3498884439468384, + "reward_std": 0.2841840088367462, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18106688559055328, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1123.607177734375, + "completions/mean_terminated_length": 864.7771606445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3718501944488839, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12534276295743713, + "kl": 0.0172271728515625, + "learning_rate": 8.125767802615799e-07, + "loss": 0.0854, + "num_tokens": 1017236628.0, + "reward": 1.4363839626312256, + "reward_std": 0.33974677324295044, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.16381210088729858, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1098.946533203125, + "completions/mean_terminated_length": 801.1495361328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37206328911618985, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1336970587515128, + "kl": 0.018218994140625, + "learning_rate": 8.123048407369921e-07, + "loss": 0.0671, + "num_tokens": 1017796236.0, + "reward": 1.3928571939468384, + "reward_std": 0.3050023019313812, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.1854480504989624, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1012.044677734375, + "completions/mean_terminated_length": 816.9442749023438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3722763837834958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13348417502947832, + "kl": 0.020294189453125, + "learning_rate": 8.120327560467526e-07, + "loss": 0.0751, + "num_tokens": 1018326224.0, + "reward": 1.512834906578064, + "reward_std": 0.3099156320095062, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.1567346155643463, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1015.9531860351562, + "completions/mean_terminated_length": 801.7546997070312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.37248947845080177, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11863971452607153, + "kl": 0.018096923828125, + "learning_rate": 8.117605263414395e-07, + "loss": 0.0569, + "num_tokens": 1018847691.0, + "reward": 1.4910714626312256, + "reward_std": 0.30305105447769165, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.1605055183172226, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1074.3973388671875, + "completions/mean_terminated_length": 843.0994873046875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3727025731181077, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12955659592522656, + "kl": 0.018402099609375, + "learning_rate": 8.114881517717112e-07, + "loss": 0.1153, + "num_tokens": 1019398797.0, + "reward": 1.4687501192092896, + "reward_std": 0.32842057943344116, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.17017030715942383, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1083.2679443359375, + "completions/mean_terminated_length": 809.6046142578125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3729156677854137, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11362491590559314, + "kl": 0.018585205078125, + "learning_rate": 8.112156324883059e-07, + "loss": 0.0587, + "num_tokens": 1019950517.0, + "reward": 1.430803656578064, + "reward_std": 0.31503814458847046, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.15939722955226898, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 934.60498046875, + "completions/mean_terminated_length": 735.3658447265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37312876245271964, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11995131213257423, + "kl": 0.023345947265625, + "learning_rate": 8.109429686420426e-07, + "loss": 0.0503, + "num_tokens": 1020435444.0, + "reward": 1.5312501192092896, + "reward_std": 0.2615625262260437, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.1744592934846878, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1097.6942138671875, + "completions/mean_terminated_length": 871.9309692382812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37334185712002554, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11842552045973728, + "kl": 0.017669677734375, + "learning_rate": 8.106701603838194e-07, + "loss": 0.0911, + "num_tokens": 1020996347.0, + "reward": 1.505022406578064, + "reward_std": 0.3941715359687805, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.1933954805135727, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1043.40185546875, + "completions/mean_terminated_length": 831.6216430664062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3735549517873315, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12450296153295778, + "kl": 0.0186767578125, + "learning_rate": 8.103972078646154e-07, + "loss": 0.0964, + "num_tokens": 1021531039.0, + "reward": 1.5094866752624512, + "reward_std": 0.332633912563324, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.17906324565410614, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1093.60498046875, + "completions/mean_terminated_length": 815.8126831054688, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.37376804645463746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12825384476421609, + "kl": 0.016571044921875, + "learning_rate": 8.101241112354883e-07, + "loss": 0.1036, + "num_tokens": 1022092990.0, + "reward": 1.5217634439468384, + "reward_std": 0.296195387840271, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19803908467292786, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1103.544677734375, + "completions/mean_terminated_length": 872.6777954101562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3739811411219434, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12777015148978366, + "kl": 0.01751708984375, + "learning_rate": 8.098508706475765e-07, + "loss": 0.1008, + "num_tokens": 1022657666.0, + "reward": 1.407366156578064, + "reward_std": 0.29539307951927185, + "rewards/accuracy_reward/mean": 0.4722222089767456, + "rewards/accuracy_reward/std": 0.49980661273002625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.15574882924556732, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1117.899658203125, + "completions/mean_terminated_length": 785.3181762695312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3741942357892494, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12685884231120742, + "kl": 0.018280029296875, + "learning_rate": 8.095774862520977e-07, + "loss": 0.0921, + "num_tokens": 1023236517.0, + "reward": 1.5078126192092896, + "reward_std": 0.3112878203392029, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.19175945222377777, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1050.962158203125, + "completions/mean_terminated_length": 853.6871948242188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37440733045655533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13002677944413424, + "kl": 0.018218994140625, + "learning_rate": 8.093039582003491e-07, + "loss": 0.1004, + "num_tokens": 1023789252.0, + "reward": 1.5837054252624512, + "reward_std": 0.4018021523952484, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.16650304198265076, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1064.953125, + "completions/mean_terminated_length": 803.9180908203125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3746204251238613, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12625153920667695, + "kl": 0.017120361328125, + "learning_rate": 8.090302866437076e-07, + "loss": 0.0547, + "num_tokens": 1024341375.0, + "reward": 1.493303656578064, + "reward_std": 0.28441575169563293, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1484815776348114, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 981.8125610351562, + "completions/mean_terminated_length": 810.5595703125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.37483351979116725, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12246241365577787, + "kl": 0.017913818359375, + "learning_rate": 8.087564717336298e-07, + "loss": 0.0649, + "num_tokens": 1024847915.0, + "reward": 1.5396206378936768, + "reward_std": 0.3043603301048279, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.1386905312538147, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1034.609375, + "completions/mean_terminated_length": 820.9757080078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3750466144584732, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12421330127733124, + "kl": 0.018280029296875, + "learning_rate": 8.084825136216509e-07, + "loss": 0.0452, + "num_tokens": 1025378604.0, + "reward": 1.5111607313156128, + "reward_std": 0.3387458026409149, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.1799820363521576, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1133.828125, + "completions/mean_terminated_length": 874.5072021484375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3752597091257791, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12217365953217148, + "kl": 0.0155487060546875, + "learning_rate": 8.082084124593858e-07, + "loss": 0.0812, + "num_tokens": 1025974015.0, + "reward": 1.3482143878936768, + "reward_std": 0.3762272596359253, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.17145662009716034, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1093.13623046875, + "completions/mean_terminated_length": 800.8309326171875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.37547280379308506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13513073747325874, + "kl": 0.017974853515625, + "learning_rate": 8.079341683985286e-07, + "loss": 0.0554, + "num_tokens": 1026535356.0, + "reward": 1.2767857313156128, + "reward_std": 0.35951194167137146, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.20875635743141174, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1003.1808471679688, + "completions/mean_terminated_length": 762.0687255859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.375685898460391, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1292557515257498, + "kl": 0.01934814453125, + "learning_rate": 8.076597815908526e-07, + "loss": 0.0965, + "num_tokens": 1027056573.0, + "reward": 1.4804688692092896, + "reward_std": 0.2993229627609253, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.15527822077274323, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1207.3817138671875, + "completions/mean_terminated_length": 896.3272705078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.375898993127697, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11442972564283374, + "kl": 0.0172576904296875, + "learning_rate": 8.073852521882093e-07, + "loss": 0.0516, + "num_tokens": 1027675464.0, + "reward": 1.3521206378936768, + "reward_std": 0.2922036349773407, + "rewards/accuracy_reward/mean": 0.3973214328289032, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.1747300922870636, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1137.9107666015625, + "completions/mean_terminated_length": 918.5816650390625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.37611208779500294, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1196506210088312, + "kl": 0.0179443359375, + "learning_rate": 8.071105803425302e-07, + "loss": 0.0777, + "num_tokens": 1028254464.0, + "reward": 1.4157366752624512, + "reward_std": 0.32972899079322815, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.18215985596179962, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1006.3928833007812, + "completions/mean_terminated_length": 776.5013427734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3763251824623089, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12549337895206544, + "kl": 0.01904296875, + "learning_rate": 8.068357662058251e-07, + "loss": 0.0687, + "num_tokens": 1028769824.0, + "reward": 1.4559152126312256, + "reward_std": 0.28184282779693604, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.19548854231834412, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1048.625, + "completions/mean_terminated_length": 837.9459838867188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37653827712961485, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11980850805485095, + "kl": 0.01800537109375, + "learning_rate": 8.065608099301824e-07, + "loss": 0.0789, + "num_tokens": 1029313912.0, + "reward": 1.5066964626312256, + "reward_std": 0.36328262090682983, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.18833374977111816, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1048.435302734375, + "completions/mean_terminated_length": 869.5658569335938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3767513717969208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11918970749984055, + "kl": 0.018707275390625, + "learning_rate": 8.062857116677696e-07, + "loss": 0.0511, + "num_tokens": 1029850747.0, + "reward": 1.6194196939468384, + "reward_std": 0.39367207884788513, + "rewards/accuracy_reward/mean": 0.6763392686843872, + "rewards/accuracy_reward/std": 0.46839532256126404, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.16921022534370422, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 988.2098388671875, + "completions/mean_terminated_length": 849.0454711914062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3769644664642267, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13006634861517039, + "kl": 0.020416259765625, + "learning_rate": 8.060104715708322e-07, + "loss": 0.1084, + "num_tokens": 1030349465.0, + "reward": 1.5786831378936768, + "reward_std": 0.383155882358551, + "rewards/accuracy_reward/mean": 0.6428571343421936, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.1969708651304245, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1111.921875, + "completions/mean_terminated_length": 839.4610595703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37717756113153267, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11957988756810889, + "kl": 0.0169830322265625, + "learning_rate": 8.057350897916948e-07, + "loss": 0.0369, + "num_tokens": 1030923846.0, + "reward": 1.3532366752624512, + "reward_std": 0.31791016459465027, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19697721302509308, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1052.7210693359375, + "completions/mean_terminated_length": 823.041259765625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.3773906557988386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13231535328403388, + "kl": 0.019866943359375, + "learning_rate": 8.0545956648276e-07, + "loss": 0.135, + "num_tokens": 1031464841.0, + "reward": 1.4994419813156128, + "reward_std": 0.41839471459388733, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19874386489391327, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1066.1763916015625, + "completions/mean_terminated_length": 798.40625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.3776037504661446, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1196844932436743, + "kl": 0.017303466796875, + "learning_rate": 8.051839017965091e-07, + "loss": 0.1029, + "num_tokens": 1032013064.0, + "reward": 1.4910714626312256, + "reward_std": 0.3079325556755066, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1675233542919159, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 951.716552734375, + "completions/mean_terminated_length": 772.3246459960938, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.37781684513345054, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14280700810925132, + "kl": 0.02117919921875, + "learning_rate": 8.049080958855012e-07, + "loss": 0.1123, + "num_tokens": 1032508329.0, + "reward": 1.6350446939468384, + "reward_std": 0.3310209810733795, + "rewards/accuracy_reward/mean": 0.6897321343421936, + "rewards/accuracy_reward/std": 0.46312037110328674, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.17798370122909546, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1104.872802734375, + "completions/mean_terminated_length": 864.4678344726562, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3780299398007565, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11952240844526296, + "kl": 0.0157928466796875, + "learning_rate": 8.046321489023736e-07, + "loss": 0.112, + "num_tokens": 1033074560.0, + "reward": 1.4235491752624512, + "reward_std": 0.3836788535118103, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.18981850147247314, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1031.58935546875, + "completions/mean_terminated_length": 807.2588500976562, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.37824303446806246, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13835621319224825, + "kl": 0.01910400390625, + "learning_rate": 8.04356060999842e-07, + "loss": 0.0514, + "num_tokens": 1033608920.0, + "reward": 1.3861607313156128, + "reward_std": 0.32100725173950195, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.19211392104625702, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1014.372802734375, + "completions/mean_terminated_length": 819.7108764648438, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.3784561291353684, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13152905571988127, + "kl": 0.020721435546875, + "learning_rate": 8.040798323307e-07, + "loss": 0.0717, + "num_tokens": 1034128031.0, + "reward": 1.5072544813156128, + "reward_std": 0.26950329542160034, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.16235284507274628, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1041.685302734375, + "completions/mean_terminated_length": 816.2267456054688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3786692238026743, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11790811939762134, + "kl": 0.017822265625, + "learning_rate": 8.038034630478191e-07, + "loss": 0.0842, + "num_tokens": 1034671650.0, + "reward": 1.4469866752624512, + "reward_std": 0.36048850417137146, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.2086060494184494, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1081.8460693359375, + "completions/mean_terminated_length": 835.5714721679688, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.3788823184699803, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12387912469036946, + "kl": 0.017364501953125, + "learning_rate": 8.035269533041483e-07, + "loss": 0.0842, + "num_tokens": 1035225437.0, + "reward": 1.4564732313156128, + "reward_std": 0.31012076139450073, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.17213605344295502, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1108.6763916015625, + "completions/mean_terminated_length": 806.6519165039062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.37909541313728623, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13060316284638238, + "kl": 0.018096923828125, + "learning_rate": 8.032503032527148e-07, + "loss": 0.088, + "num_tokens": 1035789020.0, + "reward": 1.407366156578064, + "reward_std": 0.32878005504608154, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.18340036273002625, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1092.6741943359375, + "completions/mean_terminated_length": 868.9752197265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3793085078045922, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11450227013372614, + "kl": 0.01593017578125, + "learning_rate": 8.029735130466227e-07, + "loss": 0.0938, + "num_tokens": 1036344090.0, + "reward": 1.383928656578064, + "reward_std": 0.3981441259384155, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.23556096851825714, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1003.5803833007812, + "completions/mean_terminated_length": 803.5850830078125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.37952160247189815, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1315771784202522, + "kl": 0.0176239013671875, + "learning_rate": 8.026965828390549e-07, + "loss": 0.0913, + "num_tokens": 1036856990.0, + "reward": 1.5345982313156128, + "reward_std": 0.34747010469436646, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.17929750680923462, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1107.712158203125, + "completions/mean_terminated_length": 834.0259399414062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3797346971392041, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11092434396023591, + "kl": 0.0178680419921875, + "learning_rate": 8.024195127832708e-07, + "loss": 0.043, + "num_tokens": 1037428749.0, + "reward": 1.3727679252624512, + "reward_std": 0.29710641503334045, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.19927160441875458, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1082.07373046875, + "completions/mean_terminated_length": 804.5086059570312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.37994779180651006, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12035328090706476, + "kl": 0.018280029296875, + "learning_rate": 8.021423030326075e-07, + "loss": 0.0778, + "num_tokens": 1037981166.0, + "reward": 1.3180804252624512, + "reward_std": 0.3196057379245758, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.20377831161022186, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 997.05810546875, + "completions/mean_terminated_length": 825.085693359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.380160886473816, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12269569712608709, + "kl": 0.019775390625, + "learning_rate": 8.018649537404791e-07, + "loss": 0.0886, + "num_tokens": 1038498120.0, + "reward": 1.4754464626312256, + "reward_std": 0.3356422185897827, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.4966535270214081, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.19280149042606354, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1070.7076416015625, + "completions/mean_terminated_length": 811.2005615234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3803739811411219, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13722437240310628, + "kl": 0.020111083984375, + "learning_rate": 8.015874650603776e-07, + "loss": 0.0765, + "num_tokens": 1039048069.0, + "reward": 1.477678656578064, + "reward_std": 0.3419429659843445, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.18898223340511322, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1088.716552734375, + "completions/mean_terminated_length": 889.6199340820312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3805870758084279, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1276116399540212, + "kl": 0.0164794921875, + "learning_rate": 8.013098371458715e-07, + "loss": 0.1002, + "num_tokens": 1039612694.0, + "reward": 1.3811384439468384, + "reward_std": 0.333150714635849, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1885978728532791, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1125.1629638671875, + "completions/mean_terminated_length": 893.164794921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38080017047573383, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12871077717206791, + "kl": 0.0165252685546875, + "learning_rate": 8.010320701506067e-07, + "loss": 0.1023, + "num_tokens": 1040191887.0, + "reward": 1.4654018878936768, + "reward_std": 0.3638942837715149, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.1945772022008896, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1905.0, + "completions/mean_length": 1018.04248046875, + "completions/mean_terminated_length": 776.8677978515625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.3810132651430398, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11907040379227167, + "kl": 0.019622802734375, + "learning_rate": 8.007541642283058e-07, + "loss": 0.0506, + "num_tokens": 1040719698.0, + "reward": 1.4319196939468384, + "reward_std": 0.3242812156677246, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.16082797944545746, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1119.712158203125, + "completions/mean_terminated_length": 817.6065063476562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38122635981034575, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11750005033460884, + "kl": 0.016510009765625, + "learning_rate": 8.004761195327689e-07, + "loss": 0.0897, + "num_tokens": 1041285601.0, + "reward": 1.4079241752624512, + "reward_std": 0.3554452061653137, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.1549724042415619, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 911.7076416015625, + "completions/mean_terminated_length": 759.2430419921875, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.3814394544776517, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13687215166950273, + "kl": 0.02166748046875, + "learning_rate": 8.001979362178718e-07, + "loss": 0.0819, + "num_tokens": 1041763710.0, + "reward": 1.6082589626312256, + "reward_std": 0.32452070713043213, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.4769369065761566, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.15995624661445618, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1004.9152221679688, + "completions/mean_terminated_length": 815.0132446289062, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.38165254914495766, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12917234766735372, + "kl": 0.02081298828125, + "learning_rate": 7.999196144375682e-07, + "loss": 0.0693, + "num_tokens": 1042286840.0, + "reward": 1.5161831378936768, + "reward_std": 0.37198200821876526, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.18449558317661285, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 1094.328125, + "completions/mean_terminated_length": 851.2352905273438, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3818656438122636, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13049530246507265, + "kl": 0.01922607421875, + "learning_rate": 7.996411543458876e-07, + "loss": 0.0918, + "num_tokens": 1042844075.0, + "reward": 1.4056919813156128, + "reward_std": 0.37510567903518677, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9190848469734192, + "rewards/tag_count_reward/std": 0.21352127194404602, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1134.0223388671875, + "completions/mean_terminated_length": 904.2513427734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3820787384795695, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13103210607016752, + "kl": 0.018707275390625, + "learning_rate": 7.993625560969366e-07, + "loss": 0.0994, + "num_tokens": 1043423461.0, + "reward": 1.4017857313156128, + "reward_std": 0.34620559215545654, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.19571910798549652, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1093.5067138671875, + "completions/mean_terminated_length": 801.3148803710938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3822918331468755, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12546049730504288, + "kl": 0.018096923828125, + "learning_rate": 7.990838198448979e-07, + "loss": 0.0762, + "num_tokens": 1043983560.0, + "reward": 1.4068081378936768, + "reward_std": 0.2768586277961731, + "rewards/accuracy_reward/mean": 0.4791666567325592, + "rewards/accuracy_reward/std": 0.5001450181007385, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.18820028007030487, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 918.8482666015625, + "completions/mean_terminated_length": 747.5886840820312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38250492781418144, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1373295121488945, + "kl": 0.022247314453125, + "learning_rate": 7.988049457440306e-07, + "loss": 0.0831, + "num_tokens": 1044458660.0, + "reward": 1.5507813692092896, + "reward_std": 0.3210321366786957, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.20957329869270325, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 939.997802734375, + "completions/mean_terminated_length": 762.0285034179688, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.3827180224814874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1325885978852418, + "kl": 0.02081298828125, + "learning_rate": 7.985259339486701e-07, + "loss": 0.0997, + "num_tokens": 1044944867.0, + "reward": 1.5842634439468384, + "reward_std": 0.32810935378074646, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.1818443238735199, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 966.9933471679688, + "completions/mean_terminated_length": 809.4041137695312, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.38293111714879335, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12866016966276977, + "kl": 0.017669677734375, + "learning_rate": 7.982467846132285e-07, + "loss": 0.0525, + "num_tokens": 1045449760.0, + "reward": 1.614397406578064, + "reward_std": 0.3444443941116333, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.151633620262146, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 937.80810546875, + "completions/mean_terminated_length": 785.6497192382812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3831442118160993, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12781260420368323, + "kl": 0.019561767578125, + "learning_rate": 7.97967497892193e-07, + "loss": 0.0503, + "num_tokens": 1045936122.0, + "reward": 1.5563616752624512, + "reward_std": 0.31537291407585144, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.16368533670902252, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 902.8995971679688, + "completions/mean_terminated_length": 768.685791015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38335730648340527, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13514006746352134, + "kl": 0.02056884765625, + "learning_rate": 7.976880739401279e-07, + "loss": 0.0772, + "num_tokens": 1046409261.0, + "reward": 1.6138393878936768, + "reward_std": 0.28237566351890564, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378541469574, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.12561768293380737, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1051.321533203125, + "completions/mean_terminated_length": 841.2108154296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3835704011507112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.144307215464622, + "kl": 0.0185546875, + "learning_rate": 7.974085129116726e-07, + "loss": 0.09, + "num_tokens": 1046952141.0, + "reward": 1.5279018878936768, + "reward_std": 0.32799944281578064, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.17489881813526154, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1081.53125, + "completions/mean_terminated_length": 835.176513671875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.38378349581801713, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11958077567803473, + "kl": 0.0184326171875, + "learning_rate": 7.971288149615431e-07, + "loss": 0.068, + "num_tokens": 1047505259.0, + "reward": 1.5212054252624512, + "reward_std": 0.3050203025341034, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.15772415697574615, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1085.8348388671875, + "completions/mean_terminated_length": 857.254150390625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3839965904853231, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1258723384720399, + "kl": 0.019073486328125, + "learning_rate": 7.968489802445305e-07, + "loss": 0.0668, + "num_tokens": 1048057009.0, + "reward": 1.5117188692092896, + "reward_std": 0.3104957342147827, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.17591212689876556, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 893.15185546875, + "completions/mean_terminated_length": 748.0703735351562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38420968515262904, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13237794076585396, + "kl": 0.022216796875, + "learning_rate": 7.965690089155022e-07, + "loss": 0.1252, + "num_tokens": 1048515141.0, + "reward": 1.645647406578064, + "reward_std": 0.24935492873191833, + "rewards/accuracy_reward/mean": 0.6964285969734192, + "rewards/accuracy_reward/std": 0.4603137671947479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.16321179270744324, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1075.966552734375, + "completions/mean_terminated_length": 861.4304809570312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.384422779819935, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1350488368669142, + "kl": 0.018310546875, + "learning_rate": 7.962889011294004e-07, + "loss": 0.1106, + "num_tokens": 1049074070.0, + "reward": 1.4827009439468384, + "reward_std": 0.3695772886276245, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.186005100607872, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1073.102783203125, + "completions/mean_terminated_length": 848.1264038085938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.38463587448724096, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11183417831967345, + "kl": 0.016937255859375, + "learning_rate": 7.960086570412439e-07, + "loss": 0.0665, + "num_tokens": 1049626004.0, + "reward": 1.4626116752624512, + "reward_std": 0.33566537499427795, + "rewards/accuracy_reward/mean": 0.5370370149612427, + "rewards/accuracy_reward/std": 0.49920448660850525, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.18208445608615875, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1207.359375, + "completions/mean_terminated_length": 930.4718017578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3848489691545469, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10371128354773383, + "kl": 0.0159149169921875, + "learning_rate": 7.957282768061258e-07, + "loss": 0.0673, + "num_tokens": 1050236629.0, + "reward": 1.4469866752624512, + "reward_std": 0.32617658376693726, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19121423363685608, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 921.5982666015625, + "completions/mean_terminated_length": 754.0820922851562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3850620638218529, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14412281616411, + "kl": 0.018768310546875, + "learning_rate": 7.954477605792157e-07, + "loss": 0.0978, + "num_tokens": 1050709985.0, + "reward": 1.5407366752624512, + "reward_std": 0.3019663989543915, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.16706722974777222, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1039.74560546875, + "completions/mean_terminated_length": 807.0714721679688, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.38527515848915883, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11914238745813858, + "kl": 0.016998291015625, + "learning_rate": 7.951671085157574e-07, + "loss": 0.1028, + "num_tokens": 1051245023.0, + "reward": 1.32421875, + "reward_std": 0.2977350652217865, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.2039153277873993, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1148.82373046875, + "completions/mean_terminated_length": 947.3688354492188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38548825315646473, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11345135170374829, + "kl": 0.016510009765625, + "learning_rate": 7.948863207710704e-07, + "loss": 0.0751, + "num_tokens": 1051830848.0, + "reward": 1.4447544813156128, + "reward_std": 0.35639041662216187, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.18368858098983765, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 978.29248046875, + "completions/mean_terminated_length": 840.8740234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3857013478237707, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12557491440026874, + "kl": 0.017974853515625, + "learning_rate": 7.946053975005494e-07, + "loss": 0.0981, + "num_tokens": 1052341459.0, + "reward": 1.4592634439468384, + "reward_std": 0.3712087571620941, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.16924987733364105, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 954.3683471679688, + "completions/mean_terminated_length": 810.7600708007812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38591444249107665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14545143970999172, + "kl": 0.020477294921875, + "learning_rate": 7.943243388596638e-07, + "loss": 0.0665, + "num_tokens": 1052833640.0, + "reward": 1.6060268878936768, + "reward_std": 0.3298066258430481, + "rewards/accuracy_reward/mean": 0.6607142686843872, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1707671582698822, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1110.19873046875, + "completions/mean_terminated_length": 819.5350952148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3861275371583826, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12294192509071616, + "kl": 0.017669677734375, + "learning_rate": 7.940431450039581e-07, + "loss": 0.052, + "num_tokens": 1053405969.0, + "reward": 1.3616071939468384, + "reward_std": 0.34001073241233826, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.17284893989562988, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1049.747802734375, + "completions/mean_terminated_length": 822.7479858398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38634063182568856, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11562395466354608, + "kl": 0.0179901123046875, + "learning_rate": 7.937618160890516e-07, + "loss": 0.0674, + "num_tokens": 1053938992.0, + "reward": 1.4860491752624512, + "reward_std": 0.2867286801338196, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.17932796478271484, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 973.4553833007812, + "completions/mean_terminated_length": 777.8258666992188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3865537264929945, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14547910544433923, + "kl": 0.02227783203125, + "learning_rate": 7.934803522706382e-07, + "loss": 0.1225, + "num_tokens": 1054445852.0, + "reward": 1.3750001192092896, + "reward_std": 0.3737272024154663, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.1964321881532669, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 993.1160888671875, + "completions/mean_terminated_length": 833.120849609375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3867668211603005, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13114182615344394, + "kl": 0.018829345703125, + "learning_rate": 7.931987537044867e-07, + "loss": 0.1055, + "num_tokens": 1054961200.0, + "reward": 1.5368304252624512, + "reward_std": 0.3504135012626648, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.18948033452033997, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1064.921875, + "completions/mean_terminated_length": 864.0779418945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38697991582760644, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11197691250029149, + "kl": 0.017822265625, + "learning_rate": 7.929170205464403e-07, + "loss": 0.086, + "num_tokens": 1055509261.0, + "reward": 1.5915179252624512, + "reward_std": 0.3448924124240875, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.16981780529022217, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1127.169677734375, + "completions/mean_terminated_length": 869.337158203125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.38719301049491234, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12644331294454494, + "kl": 0.016204833984375, + "learning_rate": 7.926351529524166e-07, + "loss": 0.09, + "num_tokens": 1056081337.0, + "reward": 1.4436384439468384, + "reward_std": 0.4390977621078491, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.20423343777656555, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 992.7835083007812, + "completions/mean_terminated_length": 763.3886108398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3874061051622183, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14332686167695827, + "kl": 0.018951416015625, + "learning_rate": 7.923531510784081e-07, + "loss": 0.1209, + "num_tokens": 1056601720.0, + "reward": 1.5172991752624512, + "reward_std": 0.4139867126941681, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9190848469734192, + "rewards/tag_count_reward/std": 0.22437745332717896, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1030.4442138671875, + "completions/mean_terminated_length": 851.50390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38761919982952425, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11961041495857284, + "kl": 0.017364501953125, + "learning_rate": 7.920710150804809e-07, + "loss": 0.101, + "num_tokens": 1057132607.0, + "reward": 1.5055804252624512, + "reward_std": 0.34474942088127136, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.1887938678264618, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1146.13623046875, + "completions/mean_terminated_length": 890.306640625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3878322944968302, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12147003152750725, + "kl": 0.0174560546875, + "learning_rate": 7.917887451147758e-07, + "loss": 0.0818, + "num_tokens": 1057719036.0, + "reward": 1.430803656578064, + "reward_std": 0.376973420381546, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.22532203793525696, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 931.3058471679688, + "completions/mean_terminated_length": 768.5140380859375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.38804538916413617, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13149656813654068, + "kl": 0.021728515625, + "learning_rate": 7.915063413375077e-07, + "loss": 0.0757, + "num_tokens": 1058203349.0, + "reward": 1.5658482313156128, + "reward_std": 0.3202120363712311, + "rewards/accuracy_reward/mean": 0.6527777910232544, + "rewards/accuracy_reward/std": 0.47663912177085876, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.1781519651412964, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1005.2545166015625, + "completions/mean_terminated_length": 818.657958984375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.3882584838314421, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13781638547097327, + "kl": 0.01904296875, + "learning_rate": 7.912238039049653e-07, + "loss": 0.0593, + "num_tokens": 1058719591.0, + "reward": 1.5267857313156128, + "reward_std": 0.2963954508304596, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.16070762276649475, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1020.0670166015625, + "completions/mean_terminated_length": 829.708984375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.3884715784987481, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12979604106868722, + "kl": 0.02178955078125, + "learning_rate": 7.909411329735117e-07, + "loss": 0.0919, + "num_tokens": 1059237957.0, + "reward": 1.5585938692092896, + "reward_std": 0.2846677303314209, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.1705797165632248, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1189.044677734375, + "completions/mean_terminated_length": 895.8682861328125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.38868467316605404, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 3.1625034798004856, + "kl": 0.237060546875, + "learning_rate": 7.906583286995834e-07, + "loss": 0.098, + "num_tokens": 1059840489.0, + "reward": 1.3465402126312256, + "reward_std": 0.40076354146003723, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9090401530265808, + "rewards/tag_count_reward/std": 0.24166594445705414, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1118.66748046875, + "completions/mean_terminated_length": 878.5028076171875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.38889776783336, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12691727788630341, + "kl": 0.019866943359375, + "learning_rate": 7.90375391239691e-07, + "loss": 0.0692, + "num_tokens": 1060411828.0, + "reward": 1.5892857313156128, + "reward_std": 0.32817450165748596, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.1884397715330124, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1067.294677734375, + "completions/mean_terminated_length": 863.7520141601562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3891108625006659, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12953045236246757, + "kl": 0.0185546875, + "learning_rate": 7.900923207504185e-07, + "loss": 0.0745, + "num_tokens": 1060959224.0, + "reward": 1.4514509439468384, + "reward_std": 0.34199219942092896, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.192632257938385, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1022.4464721679688, + "completions/mean_terminated_length": 816.2359619140625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.38932395716797186, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11905626036850288, + "kl": 0.01715087890625, + "learning_rate": 7.898091173884243e-07, + "loss": 0.0669, + "num_tokens": 1061487712.0, + "reward": 1.4687501192092896, + "reward_std": 0.31353992223739624, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.1628681719303131, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1008.7745971679688, + "completions/mean_terminated_length": 819.5752563476562, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.3895370518352778, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12384514271152347, + "kl": 0.017974853515625, + "learning_rate": 7.895257813104393e-07, + "loss": 0.0826, + "num_tokens": 1062014379.0, + "reward": 1.532366156578064, + "reward_std": 0.33962759375572205, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19537115097045898, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1019.7388916015625, + "completions/mean_terminated_length": 799.5962524414062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.38975014650258377, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1301693090603972, + "kl": 0.018463134765625, + "learning_rate": 7.892423126732684e-07, + "loss": 0.0799, + "num_tokens": 1062537878.0, + "reward": 1.567522406578064, + "reward_std": 0.35119324922561646, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.2017921358346939, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1026.640625, + "completions/mean_terminated_length": 827.8159790039062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.38996324116988973, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12255415322023203, + "kl": 0.018402099609375, + "learning_rate": 7.8895871163379e-07, + "loss": 0.0947, + "num_tokens": 1063075653.0, + "reward": 1.547991156578064, + "reward_std": 0.3486122190952301, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.16169501841068268, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1092.96435546875, + "completions/mean_terminated_length": 866.077392578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3901763358371957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11795780774285654, + "kl": 0.018829345703125, + "learning_rate": 7.886749783489555e-07, + "loss": 0.0279, + "num_tokens": 1063639957.0, + "reward": 1.4330357313156128, + "reward_std": 0.32432830333709717, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.17770643532276154, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1048.759033203125, + "completions/mean_terminated_length": 879.1749267578125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.39038943050450164, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.2065606739080652, + "kl": 0.0185546875, + "learning_rate": 7.883911129757894e-07, + "loss": 0.0537, + "num_tokens": 1064179625.0, + "reward": 1.4414063692092896, + "reward_std": 0.2965337038040161, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.16610048711299896, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1161.290283203125, + "completions/mean_terminated_length": 872.7160034179688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3906025251718076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11368656188594062, + "kl": 0.016510009765625, + "learning_rate": 7.881071156713893e-07, + "loss": 0.0721, + "num_tokens": 1064770507.0, + "reward": 1.3984376192092896, + "reward_std": 0.36735406517982483, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.2030172199010849, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1142.77685546875, + "completions/mean_terminated_length": 899.1614379882812, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.3908156198391135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1218887838560854, + "kl": 0.0152587890625, + "learning_rate": 7.878229865929266e-07, + "loss": 0.0943, + "num_tokens": 1065359431.0, + "reward": 1.4938616752624512, + "reward_std": 0.39353471994400024, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9157366156578064, + "rewards/tag_count_reward/std": 0.21221931278705597, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1114.7879638671875, + "completions/mean_terminated_length": 870.3126220703125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.39102871450641946, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12822419870253496, + "kl": 0.016754150390625, + "learning_rate": 7.875387258976444e-07, + "loss": 0.0402, + "num_tokens": 1065938504.0, + "reward": 1.4280134439468384, + "reward_std": 0.33710843324661255, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18714261054992676, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1126.5179443359375, + "completions/mean_terminated_length": 861.72412109375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3912418091737254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12144290200081527, + "kl": 0.017669677734375, + "learning_rate": 7.872543337428595e-07, + "loss": 0.0272, + "num_tokens": 1066512800.0, + "reward": 1.434709906578064, + "reward_std": 0.34116995334625244, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.18177567422389984, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1130.0826416015625, + "completions/mean_terminated_length": 924.428955078125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3914549038410314, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12528453007055942, + "kl": 0.017669677734375, + "learning_rate": 7.869698102859612e-07, + "loss": 0.0962, + "num_tokens": 1067089509.0, + "reward": 1.485491156578064, + "reward_std": 0.344857782125473, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.1880783587694168, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1053.1629638671875, + "completions/mean_terminated_length": 833.593994140625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.39166799850833733, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12110837430607493, + "kl": 0.0167388916015625, + "learning_rate": 7.866851556844115e-07, + "loss": 0.0843, + "num_tokens": 1067629150.0, + "reward": 1.3571429252624512, + "reward_std": 0.37157145142555237, + "rewards/accuracy_reward/mean": 0.44675925374031067, + "rewards/accuracy_reward/std": 0.4977337718009949, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.20843316614627838, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1221.9129638671875, + "completions/mean_terminated_length": 887.8526611328125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3918810931756433, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1210614574211017, + "kl": 0.0151214599609375, + "learning_rate": 7.864003700957447e-07, + "loss": 0.0718, + "num_tokens": 1068256951.0, + "reward": 1.25, + "reward_std": 0.3642135560512543, + "rewards/accuracy_reward/mean": 0.35648149251937866, + "rewards/accuracy_reward/std": 0.47951504588127136, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.23560336232185364, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1035.671875, + "completions/mean_terminated_length": 832.1206665039062, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.39209418784294925, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13549143144958467, + "kl": 0.017303466796875, + "learning_rate": 7.861154536775679e-07, + "loss": 0.0923, + "num_tokens": 1068780980.0, + "reward": 1.383928656578064, + "reward_std": 0.38585197925567627, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9129464030265808, + "rewards/tag_count_reward/std": 0.22857847809791565, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1128.9263916015625, + "completions/mean_terminated_length": 894.6527099609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3923072825102552, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1272736979876836, + "kl": 0.0174560546875, + "learning_rate": 7.858304065875607e-07, + "loss": 0.0606, + "num_tokens": 1069357331.0, + "reward": 1.3727679252624512, + "reward_std": 0.31531792879104614, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.20194751024246216, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1126.9554443359375, + "completions/mean_terminated_length": 858.8703002929688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3925203771775611, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.35038197534731513, + "kl": 0.017913818359375, + "learning_rate": 7.855452289834746e-07, + "loss": 0.0889, + "num_tokens": 1069937807.0, + "reward": 1.383928656578064, + "reward_std": 0.3546689748764038, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.23615379631519318, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1217.5848388671875, + "completions/mean_terminated_length": 896.2166748046875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.39273347184486707, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11493389150541113, + "kl": 0.015777587890625, + "learning_rate": 7.852599210231339e-07, + "loss": 0.088, + "num_tokens": 1070554629.0, + "reward": 1.4118304252624512, + "reward_std": 0.3486160635948181, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.21787147223949432, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1047.53125, + "completions/mean_terminated_length": 849.5775756835938, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.392946566512173, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12421459496491848, + "kl": 0.0166473388671875, + "learning_rate": 7.849744828644344e-07, + "loss": 0.094, + "num_tokens": 1071090563.0, + "reward": 1.5351563692092896, + "reward_std": 0.381756991147995, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19872502982616425, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 1018.435302734375, + "completions/mean_terminated_length": 784.3150634765625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.393159661179479, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14857553154008735, + "kl": 0.02093505859375, + "learning_rate": 7.846889146653445e-07, + "loss": 0.1141, + "num_tokens": 1071614678.0, + "reward": 1.4743304252624512, + "reward_std": 0.382669061422348, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.2122759371995926, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1133.5223388671875, + "completions/mean_terminated_length": 890.6949462890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.39337275584678494, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14152765966101366, + "kl": 0.019744873046875, + "learning_rate": 7.84403216583904e-07, + "loss": 0.0681, + "num_tokens": 1072197408.0, + "reward": 1.4324777126312256, + "reward_std": 0.35994189977645874, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.2239653617143631, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 993.9464721679688, + "completions/mean_terminated_length": 743.5359497070312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3935858505140909, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12327990144160939, + "kl": 0.02008056640625, + "learning_rate": 7.841173887782253e-07, + "loss": 0.1227, + "num_tokens": 1072716520.0, + "reward": 1.5580357313156128, + "reward_std": 0.31139156222343445, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.194285050034523, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1206.305908203125, + "completions/mean_terminated_length": 919.02099609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.39379894518139685, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18729405839003072, + "kl": 0.014617919921875, + "learning_rate": 7.838314314064922e-07, + "loss": 0.0849, + "num_tokens": 1073330929.0, + "reward": 1.4648438692092896, + "reward_std": 0.39464667439460754, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.21064873039722443, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1034.259033203125, + "completions/mean_terminated_length": 810.5177001953125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3940120398487028, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1259947520500176, + "kl": 0.01678466796875, + "learning_rate": 7.8354534462696e-07, + "loss": 0.0736, + "num_tokens": 1073860453.0, + "reward": 1.4955357313156128, + "reward_std": 0.31719037890434265, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.18313127756118774, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1131.0982666015625, + "completions/mean_terminated_length": 867.6206665039062, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.3942251345160087, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12491957119243292, + "kl": 0.016632080078125, + "learning_rate": 7.832591285979559e-07, + "loss": 0.0607, + "num_tokens": 1074436177.0, + "reward": 1.3828126192092896, + "reward_std": 0.35941779613494873, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.20094045996665955, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1162.1116943359375, + "completions/mean_terminated_length": 866.8154907226562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.39443822918331467, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13605322173728168, + "kl": 0.018218994140625, + "learning_rate": 7.829727834778786e-07, + "loss": 0.0502, + "num_tokens": 1075033779.0, + "reward": 1.3610491752624512, + "reward_std": 0.26947885751724243, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.18787497282028198, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 989.1317138671875, + "completions/mean_terminated_length": 772.8037719726562, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.39465132385062063, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13539899688717502, + "kl": 0.019287109375, + "learning_rate": 7.82686309425198e-07, + "loss": 0.1039, + "num_tokens": 1075545582.0, + "reward": 1.4732143878936768, + "reward_std": 0.34325820207595825, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.19134558737277985, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1127.138427734375, + "completions/mean_terminated_length": 834.6294555664062, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.3948644185179266, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1155484871912658, + "kl": 0.0164337158203125, + "learning_rate": 7.82399706598456e-07, + "loss": 0.0934, + "num_tokens": 1076113468.0, + "reward": 1.4391741752624512, + "reward_std": 0.33238133788108826, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19942738115787506, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 951.9754638671875, + "completions/mean_terminated_length": 749.0079345703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.39507751318523254, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14515584977915927, + "kl": 0.01806640625, + "learning_rate": 7.82112975156265e-07, + "loss": 0.0801, + "num_tokens": 1076611953.0, + "reward": 1.5106027126312256, + "reward_std": 0.2995833158493042, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1885978728532791, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 986.513427734375, + "completions/mean_terminated_length": 822.365966796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3952906078525385, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13106390039802518, + "kl": 0.019287109375, + "learning_rate": 7.81826115257309e-07, + "loss": 0.1322, + "num_tokens": 1077117431.0, + "reward": 1.5998884439468384, + "reward_std": 0.3600275218486786, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.470055490732193, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9280133843421936, + "rewards/tag_count_reward/std": 0.19280068576335907, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1068.0670166015625, + "completions/mean_terminated_length": 831.9058227539062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.39550370251984446, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12260616543191685, + "kl": 0.0161895751953125, + "learning_rate": 7.81539127060343e-07, + "loss": 0.0843, + "num_tokens": 1077668917.0, + "reward": 1.4012277126312256, + "reward_std": 0.33150237798690796, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19662198424339294, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 930.7879638671875, + "completions/mean_terminated_length": 741.1828002929688, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.3957167971871504, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13830920474611677, + "kl": 0.020294189453125, + "learning_rate": 7.812520107241929e-07, + "loss": 0.094, + "num_tokens": 1078150134.0, + "reward": 1.5485491752624512, + "reward_std": 0.3066059947013855, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.16355563700199127, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1094.1607666015625, + "completions/mean_terminated_length": 854.3687133789062, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3959298918544563, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13671912048210633, + "kl": 0.0186767578125, + "learning_rate": 7.809647664077557e-07, + "loss": 0.1144, + "num_tokens": 1078700510.0, + "reward": 1.3945313692092896, + "reward_std": 0.32575950026512146, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.21163024008274078, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1004.747802734375, + "completions/mean_terminated_length": 814.8153076171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3961429865217623, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.23542843650774636, + "kl": 0.01959228515625, + "learning_rate": 7.806773942699992e-07, + "loss": 0.0776, + "num_tokens": 1079226429.0, + "reward": 1.5033482313156128, + "reward_std": 0.3194870948791504, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.1717585176229477, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 983.3460083007812, + "completions/mean_terminated_length": 786.1878051757812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.39635608118906823, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13045063785346978, + "kl": 0.01837158203125, + "learning_rate": 7.803898944699619e-07, + "loss": 0.1571, + "num_tokens": 1079732568.0, + "reward": 1.4888393878936768, + "reward_std": 0.35875171422958374, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.18981274962425232, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1037.3773193359375, + "completions/mean_terminated_length": 804.1566162109375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.3965691758563742, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14387530623122705, + "kl": 0.0184326171875, + "learning_rate": 7.801022671667528e-07, + "loss": 0.0643, + "num_tokens": 1080265441.0, + "reward": 1.4983259439468384, + "reward_std": 0.32769328355789185, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19555239379405975, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 971.857177734375, + "completions/mean_terminated_length": 775.9367065429688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.39678227052368015, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1368111486404299, + "kl": 0.019744873046875, + "learning_rate": 7.798145125195515e-07, + "loss": 0.0495, + "num_tokens": 1080766849.0, + "reward": 1.4921876192092896, + "reward_std": 0.2906310558319092, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19537116587162018, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1109.904052734375, + "completions/mean_terminated_length": 833.35546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3969953651909861, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12390137295764388, + "kl": 0.017791748046875, + "learning_rate": 7.795266306876084e-07, + "loss": 0.1167, + "num_tokens": 1081329734.0, + "reward": 1.4475446939468384, + "reward_std": 0.32965415716171265, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.17320609092712402, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1075.1629638671875, + "completions/mean_terminated_length": 820.3070068359375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.39720845985829206, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12405760210112007, + "kl": 0.0166015625, + "learning_rate": 7.79238621830244e-07, + "loss": 0.0873, + "num_tokens": 1081880607.0, + "reward": 1.465959906578064, + "reward_std": 0.3064712882041931, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.17189201712608337, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1096.919677734375, + "completions/mean_terminated_length": 867.7119140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.397421554525598, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11499181809144696, + "kl": 0.017364501953125, + "learning_rate": 7.789504861068492e-07, + "loss": 0.0899, + "num_tokens": 1082443883.0, + "reward": 1.5412946939468384, + "reward_std": 0.30650460720062256, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.18805180490016937, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1171.3326416015625, + "completions/mean_terminated_length": 850.6005859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3976346491929039, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1130525972229058, + "kl": 0.016204833984375, + "learning_rate": 7.786622236768849e-07, + "loss": 0.0612, + "num_tokens": 1083033760.0, + "reward": 1.4525669813156128, + "reward_std": 0.3609563410282135, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.2174951434135437, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 992.6920166015625, + "completions/mean_terminated_length": 777.0913696289062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3978477438602099, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14285254291358337, + "kl": 0.02239990234375, + "learning_rate": 7.783738346998825e-07, + "loss": 0.0879, + "num_tokens": 1083550230.0, + "reward": 1.5189732313156128, + "reward_std": 0.30908212065696716, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.2057536393404007, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 917.1183471679688, + "completions/mean_terminated_length": 748.9359130859375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.39806083852751584, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1283459036976161, + "kl": 0.02227783203125, + "learning_rate": 7.780853193354431e-07, + "loss": 0.0685, + "num_tokens": 1084027579.0, + "reward": 1.5033482313156128, + "reward_std": 0.34642207622528076, + "rewards/accuracy_reward/mean": 0.5817307829856873, + "rewards/accuracy_reward/std": 0.49386879801750183, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.13496457040309906, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1048.6473388671875, + "completions/mean_terminated_length": 814.63916015625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.3982739331948218, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11832895675423899, + "kl": 0.017364501953125, + "learning_rate": 7.777966777432379e-07, + "loss": 0.0695, + "num_tokens": 1084567341.0, + "reward": 1.5172991752624512, + "reward_std": 0.29939672350883484, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.19084173440933228, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1030.6875, + "completions/mean_terminated_length": 832.650634765625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.39848702786212775, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1290271125551043, + "kl": 0.019989013671875, + "learning_rate": 7.775079100830078e-07, + "loss": 0.0513, + "num_tokens": 1085092993.0, + "reward": 1.4547991752624512, + "reward_std": 0.27645638585090637, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19944614171981812, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1022.8928833007812, + "completions/mean_terminated_length": 803.4254760742188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3987001225294337, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13860128657505907, + "kl": 0.017364501953125, + "learning_rate": 7.772190165145638e-07, + "loss": 0.0616, + "num_tokens": 1085617569.0, + "reward": 1.4927456378936768, + "reward_std": 0.37611469626426697, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.16943417489528656, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1092.165283203125, + "completions/mean_terminated_length": 887.5285034179688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.39891321719673967, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12031813342865773, + "kl": 0.01739501953125, + "learning_rate": 7.769299971977864e-07, + "loss": 0.0614, + "num_tokens": 1086178315.0, + "reward": 1.5094866752624512, + "reward_std": 0.3737342655658722, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.2104293406009674, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1002.1116333007812, + "completions/mean_terminated_length": 843.480712890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3991263118640456, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12344596734662744, + "kl": 0.017730712890625, + "learning_rate": 7.766408522926254e-07, + "loss": 0.1013, + "num_tokens": 1086688589.0, + "reward": 1.571428656578064, + "reward_std": 0.3135511577129364, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.1613743007183075, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1017.560302734375, + "completions/mean_terminated_length": 758.5111694335938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3993394065313515, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14424721796527923, + "kl": 0.020660400390625, + "learning_rate": 7.763515819591006e-07, + "loss": 0.0896, + "num_tokens": 1087211928.0, + "reward": 1.5251116752624512, + "reward_std": 0.3216903805732727, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.20596210658550262, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 977.075927734375, + "completions/mean_terminated_length": 814.6478271484375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.3995525011986575, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13209523098902112, + "kl": 0.019683837890625, + "learning_rate": 7.760621863573009e-07, + "loss": 0.0777, + "num_tokens": 1087717994.0, + "reward": 1.540178656578064, + "reward_std": 0.33858421444892883, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.16601119935512543, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 982.8594360351562, + "completions/mean_terminated_length": 772.109619140625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.39976559586596344, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12241846282892788, + "kl": 0.018157958984375, + "learning_rate": 7.757726656473846e-07, + "loss": 0.0988, + "num_tokens": 1088223739.0, + "reward": 1.5530134439468384, + "reward_std": 0.30941325426101685, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1944383978843689, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1097.419677734375, + "completions/mean_terminated_length": 834.7236328125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.3999786905332694, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11712741179211929, + "kl": 0.0165252685546875, + "learning_rate": 7.754830199895793e-07, + "loss": 0.0794, + "num_tokens": 1088787607.0, + "reward": 1.3789063692092896, + "reward_std": 0.34759923815727234, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.22595205903053284, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 942.6116333007812, + "completions/mean_terminated_length": 778.2205200195312, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.40019178520057536, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11591270389671879, + "kl": 0.019775390625, + "learning_rate": 7.751932495441818e-07, + "loss": 0.072, + "num_tokens": 1089277593.0, + "reward": 1.5585938692092896, + "reward_std": 0.32834160327911377, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19267114996910095, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1050.2210693359375, + "completions/mean_terminated_length": 813.1795654296875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4004048798678813, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13845436320110055, + "kl": 0.016998291015625, + "learning_rate": 7.749033544715576e-07, + "loss": 0.111, + "num_tokens": 1089820652.0, + "reward": 1.4603794813156128, + "reward_std": 0.3966957628726959, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.2092752456665039, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1063.700927734375, + "completions/mean_terminated_length": 826.4874877929688, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.40061797453518727, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12853013638243657, + "kl": 0.0180511474609375, + "learning_rate": 7.746133349321416e-07, + "loss": 0.0622, + "num_tokens": 1090374870.0, + "reward": 1.4559152126312256, + "reward_std": 0.3015094995498657, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.20909620821475983, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 1062.618408203125, + "completions/mean_terminated_length": 821.7472534179688, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.40083106920249323, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1328552826149423, + "kl": 0.019012451171875, + "learning_rate": 7.743231910864376e-07, + "loss": 0.0806, + "num_tokens": 1090922747.0, + "reward": 1.5039063692092896, + "reward_std": 0.2862297296524048, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.18560869991779327, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1038.857177734375, + "completions/mean_terminated_length": 861.3963012695312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.40104416386979913, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12812715566365404, + "kl": 0.018402099609375, + "learning_rate": 7.740329230950175e-07, + "loss": 0.1159, + "num_tokens": 1091457659.0, + "reward": 1.4302456378936768, + "reward_std": 0.334820419549942, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1833348423242569, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1086.685302734375, + "completions/mean_terminated_length": 880.8753662109375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.4012572585371051, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11487268419937838, + "kl": 0.0164031982421875, + "learning_rate": 7.737425311185229e-07, + "loss": 0.0655, + "num_tokens": 1092012654.0, + "reward": 1.465959906578064, + "reward_std": 0.3478304445743561, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.16876234114170074, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1025.4241943359375, + "completions/mean_terminated_length": 792.8931884765625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.40147035320441105, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13249806583037582, + "kl": 0.01959228515625, + "learning_rate": 7.734520153176635e-07, + "loss": 0.045, + "num_tokens": 1092536748.0, + "reward": 1.520647406578064, + "reward_std": 0.3310166001319885, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.1667456179857254, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1179.7098388671875, + "completions/mean_terminated_length": 907.255126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.401683447871717, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1190475966080052, + "kl": 0.017578125, + "learning_rate": 7.731613758532173e-07, + "loss": 0.114, + "num_tokens": 1093134602.0, + "reward": 1.4575893878936768, + "reward_std": 0.359716534614563, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.20299570262432098, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1116.165283203125, + "completions/mean_terminated_length": 881.905029296875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.40189654253902296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12008452794924968, + "kl": 0.0174560546875, + "learning_rate": 7.728706128860309e-07, + "loss": 0.0784, + "num_tokens": 1093705172.0, + "reward": 1.4090402126312256, + "reward_std": 0.3371075987815857, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.1933954805135727, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1038.13623046875, + "completions/mean_terminated_length": 847.9495849609375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4021096372063289, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17287242026321614, + "kl": 0.01898193359375, + "learning_rate": 7.725797265770199e-07, + "loss": 0.1004, + "num_tokens": 1094241873.0, + "reward": 1.4375001192092896, + "reward_std": 0.3682398200035095, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.19212691485881805, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1085.3192138671875, + "completions/mean_terminated_length": 812.2378540039062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.4023227318736349, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12364401603077671, + "kl": 0.019561767578125, + "learning_rate": 7.722887170871669e-07, + "loss": 0.0605, + "num_tokens": 1094798512.0, + "reward": 1.4676339626312256, + "reward_std": 0.2902711033821106, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.175497367978096, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 964.6473388671875, + "completions/mean_terminated_length": 816.16748046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.40253582654094083, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12486356765018565, + "kl": 0.01806640625, + "learning_rate": 7.719975845775241e-07, + "loss": 0.077, + "num_tokens": 1095293490.0, + "reward": 1.5273438692092896, + "reward_std": 0.34471288323402405, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.1910770684480667, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1130.9910888671875, + "completions/mean_terminated_length": 913.13818359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4027489212082468, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1231492638390904, + "kl": 0.01641845703125, + "learning_rate": 7.717063292092104e-07, + "loss": 0.0982, + "num_tokens": 1095869678.0, + "reward": 1.520647406578064, + "reward_std": 0.3620290458202362, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.201184943318367, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1038.04248046875, + "completions/mean_terminated_length": 791.1638793945312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4029620158755527, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12213762455203987, + "kl": 0.0181121826171875, + "learning_rate": 7.71414951143414e-07, + "loss": 0.0623, + "num_tokens": 1096412961.0, + "reward": 1.5094866752624512, + "reward_std": 0.24504512548446655, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.16474206745624542, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 965.1897583007812, + "completions/mean_terminated_length": 768.055419921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.40317511054285865, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12053049656910904, + "kl": 0.020294189453125, + "learning_rate": 7.711234505413896e-07, + "loss": 0.0646, + "num_tokens": 1096910502.0, + "reward": 1.5161831378936768, + "reward_std": 0.3053971827030182, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.17351123690605164, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1061.950927734375, + "completions/mean_terminated_length": 817.4985961914062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.4033882052101646, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11493617999075069, + "kl": 0.0181884765625, + "learning_rate": 7.708318275644612e-07, + "loss": 0.0908, + "num_tokens": 1097459744.0, + "reward": 1.450334906578064, + "reward_std": 0.3695110082626343, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9056919813156128, + "rewards/tag_count_reward/std": 0.24555550515651703, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1113.3482666015625, + "completions/mean_terminated_length": 861.8130493164062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.40360129987747057, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 5.8237167183663345, + "kl": 0.294677734375, + "learning_rate": 7.705400823740194e-07, + "loss": 0.0736, + "num_tokens": 1098044252.0, + "reward": 1.4285714626312256, + "reward_std": 0.383600652217865, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.22875317931175232, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1049.53125, + "completions/mean_terminated_length": 819.1154174804688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4038143945447765, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12328979938218865, + "kl": 0.01873779296875, + "learning_rate": 7.702482151315229e-07, + "loss": 0.0841, + "num_tokens": 1098580426.0, + "reward": 1.4497768878936768, + "reward_std": 0.2986149191856384, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.17798370122909546, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 983.19873046875, + "completions/mean_terminated_length": 812.1683959960938, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4040274892120825, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12273372867004068, + "kl": 0.019805908203125, + "learning_rate": 7.699562259984979e-07, + "loss": 0.0749, + "num_tokens": 1099091347.0, + "reward": 1.6110491752624512, + "reward_std": 0.31121882796287537, + "rewards/accuracy_reward/mean": 0.6651785969734192, + "rewards/accuracy_reward/std": 0.47245559096336365, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.17706511914730072, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1118.154052734375, + "completions/mean_terminated_length": 881.1344604492188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.40424058387938844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12539019288658815, + "kl": 0.0167083740234375, + "learning_rate": 7.696641151365379e-07, + "loss": 0.0775, + "num_tokens": 1099660872.0, + "reward": 1.3431919813156128, + "reward_std": 0.34335654973983765, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.17828068137168884, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1071.796875, + "completions/mean_terminated_length": 843.2093505859375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4044536785466944, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11245772545518806, + "kl": 0.018035888671875, + "learning_rate": 7.693718827073042e-07, + "loss": 0.0446, + "num_tokens": 1100209725.0, + "reward": 1.4135044813156128, + "reward_std": 0.34260883927345276, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.16131143271923065, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1123.669677734375, + "completions/mean_terminated_length": 851.17919921875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.4046667732140003, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1386534029995427, + "kl": 0.017730712890625, + "learning_rate": 7.690795288725247e-07, + "loss": 0.0865, + "num_tokens": 1100789625.0, + "reward": 1.4944196939468384, + "reward_std": 0.3147575855255127, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.19750650227069855, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1095.6138916015625, + "completions/mean_terminated_length": 875.8324584960938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.40487986788130625, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1410802082284113, + "kl": 0.0186767578125, + "learning_rate": 7.687870537939953e-07, + "loss": 0.0668, + "num_tokens": 1101351628.0, + "reward": 1.4375001192092896, + "reward_std": 0.3177659511566162, + "rewards/accuracy_reward/mean": 0.5393518805503845, + "rewards/accuracy_reward/std": 0.49902692437171936, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9174107313156128, + "rewards/tag_count_reward/std": 0.2240774929523468, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 946.5714721679688, + "completions/mean_terminated_length": 792.427490234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4050929625486122, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12392036962717298, + "kl": 0.018524169921875, + "learning_rate": 7.684944576335781e-07, + "loss": 0.0998, + "num_tokens": 1101844044.0, + "reward": 1.5463169813156128, + "reward_std": 0.2869468629360199, + "rewards/accuracy_reward/mean": 0.6087962985038757, + "rewards/accuracy_reward/std": 0.4885856807231903, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.15647153556346893, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 917.83935546875, + "completions/mean_terminated_length": 788.5173950195312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.40530605721591817, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1178605409478237, + "kl": 0.01971435546875, + "learning_rate": 7.682017405532032e-07, + "loss": 0.0511, + "num_tokens": 1102314564.0, + "reward": 1.5747768878936768, + "reward_std": 0.2550079822540283, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.166563019156456, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1052.9375, + "completions/mean_terminated_length": 830.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.4055191518832241, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13026846576980652, + "kl": 0.019622802734375, + "learning_rate": 7.679089027148668e-07, + "loss": 0.0778, + "num_tokens": 1102853192.0, + "reward": 1.4654018878936768, + "reward_std": 0.2923460006713867, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19537116587162018, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1042.513427734375, + "completions/mean_terminated_length": 843.5668334960938, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.4057322465505301, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11842871270811793, + "kl": 0.019256591796875, + "learning_rate": 7.676159442806321e-07, + "loss": 0.0434, + "num_tokens": 1103388478.0, + "reward": 1.532366156578064, + "reward_std": 0.3286937177181244, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.14629201591014862, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1080.46435546875, + "completions/mean_terminated_length": 837.22900390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.40594534121783604, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11699158131722365, + "kl": 0.0172119140625, + "learning_rate": 7.673228654126292e-07, + "loss": 0.083, + "num_tokens": 1103935502.0, + "reward": 1.446428656578064, + "reward_std": 0.3618532121181488, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.20673725008964539, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1024.890625, + "completions/mean_terminated_length": 844.9737548828125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.406158435885142, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11684507860441362, + "kl": 0.019012451171875, + "learning_rate": 7.670296662730552e-07, + "loss": 0.0661, + "num_tokens": 1104465485.0, + "reward": 1.5212054252624512, + "reward_std": 0.33779019117355347, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16402533650398254, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1201.3348388671875, + "completions/mean_terminated_length": 895.09423828125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4063715305524479, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11970346142741332, + "kl": 0.0160064697265625, + "learning_rate": 7.66736347024173e-07, + "loss": 0.0853, + "num_tokens": 1105079075.0, + "reward": 1.3353794813156128, + "reward_std": 0.3123358488082886, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.15823286771774292, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 978.6138916015625, + "completions/mean_terminated_length": 838.1893920898438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.40658462521975386, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11721629706218116, + "kl": 0.017791748046875, + "learning_rate": 7.664429078283127e-07, + "loss": 0.0412, + "num_tokens": 1105587894.0, + "reward": 1.4843751192092896, + "reward_std": 0.2872348725795746, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.1342271864414215, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1116.118408203125, + "completions/mean_terminated_length": 868.6694946289062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4067977198870598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12476349915574803, + "kl": 0.0177001953125, + "learning_rate": 7.6614934884787e-07, + "loss": 0.0751, + "num_tokens": 1106160875.0, + "reward": 1.3978794813156128, + "reward_std": 0.37329185009002686, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.2233457714319229, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1034.493408203125, + "completions/mean_terminated_length": 800.607177734375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.4070108145543658, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 1.6015496574119263, + "kl": 0.06658935546875, + "learning_rate": 7.658556702453075e-07, + "loss": 0.0797, + "num_tokens": 1106700488.0, + "reward": 1.5764509439468384, + "reward_std": 0.2860429286956787, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.18338249623775482, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 893.4397583007812, + "completions/mean_terminated_length": 735.2005004882812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.40722390922167173, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14125542459461077, + "kl": 0.02130126953125, + "learning_rate": 7.655618721831538e-07, + "loss": 0.0745, + "num_tokens": 1107166077.0, + "reward": 1.6250001192092896, + "reward_std": 0.27611470222473145, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.15008719265460968, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1029.94873046875, + "completions/mean_terminated_length": 835.0026245117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4074370038889777, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1212086610378566, + "kl": 0.018218994140625, + "learning_rate": 7.652679548240038e-07, + "loss": 0.0512, + "num_tokens": 1107697654.0, + "reward": 1.5920759439468384, + "reward_std": 0.3181268274784088, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.15465790033340454, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1011.9152221679688, + "completions/mean_terminated_length": 826.5105590820312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.40765009855628365, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1576189361046836, + "kl": 0.0196533203125, + "learning_rate": 7.649739183305183e-07, + "loss": 0.0849, + "num_tokens": 1108217472.0, + "reward": 1.4983259439468384, + "reward_std": 0.2966291010379791, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.16827340424060822, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1028.84375, + "completions/mean_terminated_length": 810.6504516601562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4078631932235896, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12359925338967898, + "kl": 0.019989013671875, + "learning_rate": 7.646797628654236e-07, + "loss": 0.072, + "num_tokens": 1108745146.0, + "reward": 1.4665179252624512, + "reward_std": 0.3006556034088135, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.19207490980625153, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 992.872802734375, + "completions/mean_terminated_length": 804.060546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4080762878908955, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1183473827657037, + "kl": 0.020111083984375, + "learning_rate": 7.643854885915128e-07, + "loss": 0.0634, + "num_tokens": 1109257649.0, + "reward": 1.4598214626312256, + "reward_std": 0.3134208619594574, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.17832356691360474, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1189.3192138671875, + "completions/mean_terminated_length": 923.1783447265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.40828938255820146, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11657960379917942, + "kl": 0.0164947509765625, + "learning_rate": 7.640910956716437e-07, + "loss": 0.0986, + "num_tokens": 1109859152.0, + "reward": 1.465959906578064, + "reward_std": 0.3952101469039917, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.23859341442584991, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1006.3035888671875, + "completions/mean_terminated_length": 800.1925048828125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.4085024772255074, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13964906422790405, + "kl": 0.01837158203125, + "learning_rate": 7.637965842687404e-07, + "loss": 0.0816, + "num_tokens": 1110381368.0, + "reward": 1.4670759439468384, + "reward_std": 0.3010718822479248, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.1591140478849411, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 983.74560546875, + "completions/mean_terminated_length": 766.3171997070312, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.4087155718928134, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13604018468250148, + "kl": 0.019744873046875, + "learning_rate": 7.635019545457923e-07, + "loss": 0.0765, + "num_tokens": 1110890566.0, + "reward": 1.454241156578064, + "reward_std": 0.29963263869285583, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.18175934255123138, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1196.1273193359375, + "completions/mean_terminated_length": 922.2212524414062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.40892866656011934, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1177594379251397, + "kl": 0.016845703125, + "learning_rate": 7.632072066658549e-07, + "loss": 0.0621, + "num_tokens": 1111503631.0, + "reward": 1.31640625, + "reward_std": 0.35845157504081726, + "rewards/accuracy_reward/mean": 0.40509259700775146, + "rewards/accuracy_reward/std": 0.49147915840148926, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9235491156578064, + "rewards/tag_count_reward/std": 0.2190280258655548, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 933.2053833007812, + "completions/mean_terminated_length": 783.6253051757812, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.4091417612274253, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11169386231734406, + "kl": 0.020538330078125, + "learning_rate": 7.62912340792048e-07, + "loss": 0.0108, + "num_tokens": 1111982843.0, + "reward": 1.5491071939468384, + "reward_std": 0.2688133418560028, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.1424395591020584, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1150.5335693359375, + "completions/mean_terminated_length": 899.2428588867188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.40935485589473125, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11669236221881746, + "kl": 0.0157012939453125, + "learning_rate": 7.626173570875576e-07, + "loss": 0.0829, + "num_tokens": 1112570858.0, + "reward": 1.3465402126312256, + "reward_std": 0.33845916390419006, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.1933760941028595, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 974.4397583007812, + "completions/mean_terminated_length": 805.2222290039062, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.4095679505620372, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13847781885523142, + "kl": 0.018341064453125, + "learning_rate": 7.623222557156344e-07, + "loss": 0.1179, + "num_tokens": 1113073791.0, + "reward": 1.5920759439468384, + "reward_std": 0.38161033391952515, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19768577814102173, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1138.43310546875, + "completions/mean_terminated_length": 893.6487426757812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4097810452293431, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1033313550511004, + "kl": 0.077972412109375, + "learning_rate": 7.620270368395947e-07, + "loss": 0.0734, + "num_tokens": 1113657489.0, + "reward": 1.4017857313156128, + "reward_std": 0.3582235276699066, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.18152911961078644, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1050.703125, + "completions/mean_terminated_length": 859.7313842773438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.40999413989664907, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13789160292773583, + "kl": 0.01885986328125, + "learning_rate": 7.617317006228193e-07, + "loss": 0.1267, + "num_tokens": 1114195644.0, + "reward": 1.4575893878936768, + "reward_std": 0.3317590653896332, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.16865228116512299, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1090.140625, + "completions/mean_terminated_length": 927.5796508789062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.410207234563955, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13140616526494253, + "kl": 0.017974853515625, + "learning_rate": 7.614362472287543e-07, + "loss": 0.1185, + "num_tokens": 1114750507.0, + "reward": 1.5775669813156128, + "reward_std": 0.3307828903198242, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.1600995808839798, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1103.2701416015625, + "completions/mean_terminated_length": 919.3626708984375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.410420329231261, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11497199331048603, + "kl": 0.01812744140625, + "learning_rate": 7.611406768209105e-07, + "loss": 0.0985, + "num_tokens": 1115311012.0, + "reward": 1.4168527126312256, + "reward_std": 0.2822326123714447, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.18409591913223267, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1101.82373046875, + "completions/mean_terminated_length": 847.1869506835938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.41063342389856694, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12747365124474375, + "kl": 0.01995849609375, + "learning_rate": 7.608449895628636e-07, + "loss": 0.0669, + "num_tokens": 1115870309.0, + "reward": 1.4821429252624512, + "reward_std": 0.33656126260757446, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9263392686843872, + "rewards/tag_count_reward/std": 0.21254922449588776, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 970.497802734375, + "completions/mean_terminated_length": 835.1331787109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4108465185658729, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12675855682400994, + "kl": 0.0205078125, + "learning_rate": 7.605491856182537e-07, + "loss": 0.0337, + "num_tokens": 1116367428.0, + "reward": 1.5731027126312256, + "reward_std": 0.25458186864852905, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.15814605355262756, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 996.8147583007812, + "completions/mean_terminated_length": 843.5728759765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.41105961323317886, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12126914495141694, + "kl": 0.019561767578125, + "learning_rate": 7.602532651507858e-07, + "loss": 0.0526, + "num_tokens": 1116881601.0, + "reward": 1.5920759439468384, + "reward_std": 0.26761388778686523, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.1746300309896469, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1064.1875, + "completions/mean_terminated_length": 863.1935424804688, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4112727079004848, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11458607807511013, + "kl": 0.019317626953125, + "learning_rate": 7.599572283242291e-07, + "loss": 0.0365, + "num_tokens": 1117422437.0, + "reward": 1.4799107313156128, + "reward_std": 0.2579093873500824, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.19500340521335602, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1065.5023193359375, + "completions/mean_terminated_length": 828.7229614257812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4114858025677907, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12368527621923536, + "kl": 0.0172576904296875, + "learning_rate": 7.596610753024174e-07, + "loss": 0.0745, + "num_tokens": 1117963654.0, + "reward": 1.5055804252624512, + "reward_std": 0.33166131377220154, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.19743064045906067, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1040.33935546875, + "completions/mean_terminated_length": 840.9625854492188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4116988972350967, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13770041230205, + "kl": 0.018218994140625, + "learning_rate": 7.593648062492486e-07, + "loss": 0.0743, + "num_tokens": 1118510462.0, + "reward": 1.4174107313156128, + "reward_std": 0.3346589505672455, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.18684300780296326, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1000.0513916015625, + "completions/mean_terminated_length": 818.9921264648438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.41191199190240263, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1318098802807133, + "kl": 0.019622802734375, + "learning_rate": 7.590684213286852e-07, + "loss": 0.131, + "num_tokens": 1119025765.0, + "reward": 1.4838169813156128, + "reward_std": 0.34468933939933777, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.1652565598487854, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 936.466552734375, + "completions/mean_terminated_length": 747.8250732421875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.4121250865697086, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11760137475832275, + "kl": 0.02081298828125, + "learning_rate": 7.587719207047534e-07, + "loss": 0.0443, + "num_tokens": 1119511462.0, + "reward": 1.4670759439468384, + "reward_std": 0.24830979108810425, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.15255293250083923, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1042.984375, + "completions/mean_terminated_length": 881.5569458007812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.41233818123701454, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13707341835411435, + "kl": 0.01953125, + "learning_rate": 7.584753045415436e-07, + "loss": 0.0551, + "num_tokens": 1120053503.0, + "reward": 1.4804688692092896, + "reward_std": 0.30049338936805725, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.16282889246940613, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1118.8773193359375, + "completions/mean_terminated_length": 885.298828125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.4125512759043205, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12387350751807251, + "kl": 0.0186767578125, + "learning_rate": 7.581785730032102e-07, + "loss": 0.0891, + "num_tokens": 1120624888.0, + "reward": 1.4765626192092896, + "reward_std": 0.3474372923374176, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.21554414927959442, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1040.5513916015625, + "completions/mean_terminated_length": 878.733154296875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.41276437057162646, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12335153143584317, + "kl": 0.01910400390625, + "learning_rate": 7.578817262539713e-07, + "loss": 0.0834, + "num_tokens": 1121162815.0, + "reward": 1.5809152126312256, + "reward_std": 0.3612014353275299, + "rewards/accuracy_reward/mean": 0.6666666865348816, + "rewards/accuracy_reward/std": 0.47195106744766235, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19048160314559937, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 981.0491333007812, + "completions/mean_terminated_length": 819.2236328125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4129774652389324, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12399434327209002, + "kl": 0.0203857421875, + "learning_rate": 7.575847644581089e-07, + "loss": 0.0587, + "num_tokens": 1121666741.0, + "reward": 1.4988839626312256, + "reward_std": 0.283370703458786, + "rewards/accuracy_reward/mean": 0.5717592835426331, + "rewards/accuracy_reward/std": 0.49539753794670105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.18175935745239258, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1121.140625, + "completions/mean_terminated_length": 881.6151733398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4131905599062383, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11752852066908871, + "kl": 0.0165252685546875, + "learning_rate": 7.572876877799686e-07, + "loss": 0.104, + "num_tokens": 1122246404.0, + "reward": 1.4871652126312256, + "reward_std": 0.3367440402507782, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.17399263381958008, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1100.0335693359375, + "completions/mean_terminated_length": 838.059814453125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4134036545735443, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11903454038818558, + "kl": 0.016693115234375, + "learning_rate": 7.569904963839598e-07, + "loss": 0.074, + "num_tokens": 1122809203.0, + "reward": 1.419084906578064, + "reward_std": 0.3181091248989105, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.16924987733364105, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 955.950927734375, + "completions/mean_terminated_length": 746.8350830078125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.41361674924085023, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14566865833926723, + "kl": 0.020263671875, + "learning_rate": 7.566931904345548e-07, + "loss": 0.0788, + "num_tokens": 1123305229.0, + "reward": 1.563616156578064, + "reward_std": 0.32548803091049194, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1611691564321518, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 982.5178833007812, + "completions/mean_terminated_length": 754.406494140625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.4138298439081562, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13735596938993327, + "kl": 0.019866943359375, + "learning_rate": 7.563957700962899e-07, + "loss": 0.1024, + "num_tokens": 1123817813.0, + "reward": 1.5301339626312256, + "reward_std": 0.2228216826915741, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16823354363441467, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 997.7857666015625, + "completions/mean_terminated_length": 806.5858154296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.41404293857546215, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1164040307777098, + "kl": 0.01806640625, + "learning_rate": 7.560982355337647e-07, + "loss": 0.0168, + "num_tokens": 1124337957.0, + "reward": 1.4704241752624512, + "reward_std": 0.2959195673465729, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.1500861495733261, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 991.4553833007812, + "completions/mean_terminated_length": 785.7813110351562, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4142560332427681, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1472234281520342, + "kl": 0.020904541015625, + "learning_rate": 7.558005869116416e-07, + "loss": 0.1302, + "num_tokens": 1124852385.0, + "reward": 1.6049107313156128, + "reward_std": 0.22037231922149658, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.19644489884376526, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1034.810302734375, + "completions/mean_terminated_length": 856.6378173828125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.41446912791007406, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12173272089035926, + "kl": 0.017486572265625, + "learning_rate": 7.555028243946462e-07, + "loss": 0.0594, + "num_tokens": 1125389708.0, + "reward": 1.4642857313156128, + "reward_std": 0.3436257541179657, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.2068338543176651, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1104.75, + "completions/mean_terminated_length": 864.313720703125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.41468222257738, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11731568543574519, + "kl": 0.01654052734375, + "learning_rate": 7.552049481475674e-07, + "loss": 0.0976, + "num_tokens": 1125953708.0, + "reward": 1.3856027126312256, + "reward_std": 0.27327868342399597, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.19285248219966888, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1063.482177734375, + "completions/mean_terminated_length": 832.9476928710938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.414895317244686, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1302797180316065, + "kl": 0.01806640625, + "learning_rate": 7.54906958335257e-07, + "loss": 0.0381, + "num_tokens": 1126498932.0, + "reward": 1.5368304252624512, + "reward_std": 0.2780376672744751, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.17065013945102692, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1128.3013916015625, + "completions/mean_terminated_length": 857.17626953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4151084119119919, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12062219590206154, + "kl": 0.017547607421875, + "learning_rate": 7.546088551226294e-07, + "loss": 0.0467, + "num_tokens": 1127072187.0, + "reward": 1.4497768878936768, + "reward_std": 0.3176611661911011, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1810987889766693, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1018.7857666015625, + "completions/mean_terminated_length": 847.25, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.41532150657929784, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13871092067936677, + "kl": 0.019622802734375, + "learning_rate": 7.543106386746619e-07, + "loss": 0.0591, + "num_tokens": 1127594075.0, + "reward": 1.3934152126312256, + "reward_std": 0.27748817205429077, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.16975806653499603, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1171.1875, + "completions/mean_terminated_length": 941.4873046875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.4155346012466038, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.2833425937338556, + "kl": 0.0170440673828125, + "learning_rate": 7.540123091563947e-07, + "loss": 0.0874, + "num_tokens": 1128193791.0, + "reward": 1.4375001192092896, + "reward_std": 0.31718990206718445, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.1964321881532669, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1016.700927734375, + "completions/mean_terminated_length": 806.00537109375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.41574769591390975, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12241231150081495, + "kl": 0.021484375, + "learning_rate": 7.537138667329302e-07, + "loss": 0.0955, + "num_tokens": 1128715785.0, + "reward": 1.5742188692092896, + "reward_std": 0.2853057384490967, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15551921725273132, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1203.185302734375, + "completions/mean_terminated_length": 921.5803833007812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4159607905812157, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10668216636883829, + "kl": 0.016510009765625, + "learning_rate": 7.534153115694332e-07, + "loss": 0.0617, + "num_tokens": 1129334956.0, + "reward": 1.3666294813156128, + "reward_std": 0.31266745924949646, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19121423363685608, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1099.321533203125, + "completions/mean_terminated_length": 908.5684204101562, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.41617388524852167, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11651819198423108, + "kl": 0.0162353515625, + "learning_rate": 7.531166438311314e-07, + "loss": 0.0126, + "num_tokens": 1129894780.0, + "reward": 1.4492188692092896, + "reward_std": 0.3175099790096283, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.1619679182767868, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1059.227783203125, + "completions/mean_terminated_length": 831.0494995117188, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.4163869799158276, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1270686075033123, + "kl": 0.018096923828125, + "learning_rate": 7.528178636833145e-07, + "loss": 0.0965, + "num_tokens": 1130441410.0, + "reward": 1.4196429252624512, + "reward_std": 0.2730393707752228, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.1719365119934082, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1023.88623046875, + "completions/mean_terminated_length": 791.0054931640625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.4166000745831336, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 41.52921408668382, + "kl": 3.1875, + "learning_rate": 7.525189712913346e-07, + "loss": 0.2041, + "num_tokens": 1130974031.0, + "reward": 1.3883929252624512, + "reward_std": 0.31834521889686584, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9129464030265808, + "rewards/tag_count_reward/std": 0.24453723430633545, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1060.21435546875, + "completions/mean_terminated_length": 835.5945434570312, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4168131692504395, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.47089650360723145, + "kl": 0.018890380859375, + "learning_rate": 7.522199668206056e-07, + "loss": 0.1222, + "num_tokens": 1131527151.0, + "reward": 1.5217634439468384, + "reward_std": 0.2847650945186615, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.16055120527744293, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 945.4241333007812, + "completions/mean_terminated_length": 794.3096313476562, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.41702626391774544, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13266296706410222, + "kl": 0.021728515625, + "learning_rate": 7.519208504366035e-07, + "loss": 0.0605, + "num_tokens": 1132018781.0, + "reward": 1.5658482313156128, + "reward_std": 0.29443827271461487, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.14178510010242462, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1103.0848388671875, + "completions/mean_terminated_length": 831.5574951171875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.4172393585850514, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1234201463301303, + "kl": 0.01806640625, + "learning_rate": 7.516216223048663e-07, + "loss": 0.0782, + "num_tokens": 1132581107.0, + "reward": 1.4497768878936768, + "reward_std": 0.3119356334209442, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1740114688873291, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1184.625, + "completions/mean_terminated_length": 917.0292358398438, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.41745245325235736, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1325135093578888, + "kl": 0.020782470703125, + "learning_rate": 7.513222825909942e-07, + "loss": 0.0257, + "num_tokens": 1133190603.0, + "reward": 1.32421875, + "reward_std": 0.31454217433929443, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.2270708829164505, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 982.0960083007812, + "completions/mean_terminated_length": 777.9866943359375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.4176655479196633, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13794098097949525, + "kl": 0.020111083984375, + "learning_rate": 7.510228314606484e-07, + "loss": 0.0564, + "num_tokens": 1133702182.0, + "reward": 1.4252232313156128, + "reward_std": 0.3640156090259552, + "rewards/accuracy_reward/mean": 0.49537035822868347, + "rewards/accuracy_reward/std": 0.5005581974983215, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.18416117131710052, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1165.388427734375, + "completions/mean_terminated_length": 891.8304443359375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.4178786425869693, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12164675538176777, + "kl": 0.017425537109375, + "learning_rate": 7.507232690795525e-07, + "loss": 0.1106, + "num_tokens": 1134294964.0, + "reward": 1.4202009439468384, + "reward_std": 0.38431569933891296, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.18972641229629517, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 910.8616333007812, + "completions/mean_terminated_length": 748.4132690429688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.41809173725427523, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13209947562323152, + "kl": 0.021209716796875, + "learning_rate": 7.504235956134911e-07, + "loss": 0.0757, + "num_tokens": 1134771094.0, + "reward": 1.5803571939468384, + "reward_std": 0.3169344961643219, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.18066051602363586, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1074.1585693359375, + "completions/mean_terminated_length": 878.3458862304688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4183048319215812, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13444507963379274, + "kl": 0.01910400390625, + "learning_rate": 7.501238112283109e-07, + "loss": 0.141, + "num_tokens": 1135320349.0, + "reward": 1.5658482313156128, + "reward_std": 0.3748548924922943, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.17649045586585999, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1048.930908203125, + "completions/mean_terminated_length": 838.3162231445312, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.4185179265888871, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11863590802955154, + "kl": 0.019378662109375, + "learning_rate": 7.49823916089919e-07, + "loss": 0.109, + "num_tokens": 1135859838.0, + "reward": 1.657366156578064, + "reward_std": 0.35045763850212097, + "rewards/accuracy_reward/mean": 0.71875, + "rewards/accuracy_reward/std": 0.45011183619499207, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19175945222377777, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1060.5692138671875, + "completions/mean_terminated_length": 855.6307373046875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.41873102125619305, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12687230708668706, + "kl": 0.01910400390625, + "learning_rate": 7.495239103642849e-07, + "loss": 0.0712, + "num_tokens": 1136398029.0, + "reward": 1.4720982313156128, + "reward_std": 0.35651785135269165, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.17213605344295502, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1001.3460083007812, + "completions/mean_terminated_length": 854.86767578125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.418944115923499, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13707076088885917, + "kl": 0.018890380859375, + "learning_rate": 7.492237942174387e-07, + "loss": 0.068, + "num_tokens": 1136917496.0, + "reward": 1.5039063692092896, + "reward_std": 0.3045034408569336, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.17107665538787842, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1026.74560546875, + "completions/mean_terminated_length": 811.4541015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.41915721059080496, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11670777666372437, + "kl": 0.02032470703125, + "learning_rate": 7.489235678154718e-07, + "loss": 0.0766, + "num_tokens": 1137447798.0, + "reward": 1.5379464626312256, + "reward_std": 0.2798476219177246, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.17063917219638824, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1133.8929443359375, + "completions/mean_terminated_length": 850.5731201171875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4193703052581109, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13003822700247553, + "kl": 0.0196533203125, + "learning_rate": 7.486232313245362e-07, + "loss": 0.0691, + "num_tokens": 1138022422.0, + "reward": 1.4525669813156128, + "reward_std": 0.31875282526016235, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9146205186843872, + "rewards/tag_count_reward/std": 0.23133563995361328, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1124.5982666015625, + "completions/mean_terminated_length": 911.5054931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.4195833999254169, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12400522762694725, + "kl": 0.017333984375, + "learning_rate": 7.483227849108455e-07, + "loss": 0.0996, + "num_tokens": 1138591650.0, + "reward": 1.4497768878936768, + "reward_std": 0.363167405128479, + "rewards/accuracy_reward/mean": 0.5162037014961243, + "rewards/accuracy_reward/std": 0.5003167986869812, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.17032796144485474, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1005.4420166015625, + "completions/mean_terminated_length": 825.3141479492188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.41979649459272284, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12795603506722777, + "kl": 0.02056884765625, + "learning_rate": 7.480222287406737e-07, + "loss": 0.0584, + "num_tokens": 1139115816.0, + "reward": 1.497209906578064, + "reward_std": 0.2912636399269104, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.15101487934589386, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1050.7545166015625, + "completions/mean_terminated_length": 875.3858032226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4200095892600288, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11754554426197114, + "kl": 0.018280029296875, + "learning_rate": 7.477215629803555e-07, + "loss": 0.0588, + "num_tokens": 1139662618.0, + "reward": 1.5463169813156128, + "reward_std": 0.3298322856426239, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.15702906250953674, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 996.66748046875, + "completions/mean_terminated_length": 855.6025390625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4202226839273347, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11588709542951056, + "kl": 0.01788330078125, + "learning_rate": 7.474207877962866e-07, + "loss": 0.0479, + "num_tokens": 1140177109.0, + "reward": 1.4648438692092896, + "reward_std": 0.28817933797836304, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14392071962356567, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1120.765625, + "completions/mean_terminated_length": 884.4118041992188, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.42043577859464065, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11929480171024315, + "kl": 0.0176239013671875, + "learning_rate": 7.471199033549228e-07, + "loss": 0.0589, + "num_tokens": 1140747244.0, + "reward": 1.4458706378936768, + "reward_std": 0.3368956446647644, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.17670516669750214, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1132.2366943359375, + "completions/mean_terminated_length": 895.5786743164062, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.4206488732619466, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11648480572944128, + "kl": 0.018585205078125, + "learning_rate": 7.468189098227809e-07, + "loss": 0.0627, + "num_tokens": 1141320438.0, + "reward": 1.4977679252624512, + "reward_std": 0.33273303508758545, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.17423014342784882, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1084.4241943359375, + "completions/mean_terminated_length": 811.0888671875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.42086196792925257, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1190998217571097, + "kl": 0.017852783203125, + "learning_rate": 7.465178073664373e-07, + "loss": 0.1049, + "num_tokens": 1141872100.0, + "reward": 1.5039063692092896, + "reward_std": 0.3167956471443176, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.16314294934272766, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1058.57373046875, + "completions/mean_terminated_length": 820.1246337890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4210750625965585, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12479343437162574, + "kl": 0.0181884765625, + "learning_rate": 7.462165961525298e-07, + "loss": 0.0876, + "num_tokens": 1142413205.0, + "reward": 1.532366156578064, + "reward_std": 0.2935563623905182, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.19385728240013123, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1071.1317138671875, + "completions/mean_terminated_length": 835.7091064453125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4212881572638645, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1327260926365753, + "kl": 0.019073486328125, + "learning_rate": 7.459152763477552e-07, + "loss": 0.0928, + "num_tokens": 1142966272.0, + "reward": 1.450334906578064, + "reward_std": 0.32430052757263184, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.18636047840118408, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 983.6585083007812, + "completions/mean_terminated_length": 803.026123046875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.42150125193117044, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11774500868802211, + "kl": 0.02117919921875, + "learning_rate": 7.456138481188713e-07, + "loss": 0.043, + "num_tokens": 1143480343.0, + "reward": 1.5351563692092896, + "reward_std": 0.2810564637184143, + "rewards/accuracy_reward/mean": 0.6064814925193787, + "rewards/accuracy_reward/std": 0.4890965521335602, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.17189201712608337, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 957.6652221679688, + "completions/mean_terminated_length": 850.7696533203125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4217143465984764, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13647354888474048, + "kl": 0.020111083984375, + "learning_rate": 7.453123116326955e-07, + "loss": 0.0658, + "num_tokens": 1143986529.0, + "reward": 1.5446429252624512, + "reward_std": 0.26959431171417236, + "rewards/accuracy_reward/mean": 0.6064814925193787, + "rewards/accuracy_reward/std": 0.4890965521335602, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.15285643935203552, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1085.3951416015625, + "completions/mean_terminated_length": 866.5014038085938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4219274412657823, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12794814428472606, + "kl": 0.0171966552734375, + "learning_rate": 7.450106670561049e-07, + "loss": 0.1046, + "num_tokens": 1144553058.0, + "reward": 1.3621652126312256, + "reward_std": 0.35093072056770325, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.18826662003993988, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 951.3973388671875, + "completions/mean_terminated_length": 768.6302490234375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.42214053593308826, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1332421447932563, + "kl": 0.021636962890625, + "learning_rate": 7.44708914556037e-07, + "loss": 0.1182, + "num_tokens": 1145049684.0, + "reward": 1.5418527126312256, + "reward_std": 0.3753419816493988, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.19447050988674164, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1060.4888916015625, + "completions/mean_terminated_length": 889.8717651367188, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.4223536306003942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12549930456168193, + "kl": 0.019744873046875, + "learning_rate": 7.444070542994886e-07, + "loss": 0.0739, + "num_tokens": 1145597439.0, + "reward": 1.5747768878936768, + "reward_std": 0.35021594166755676, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.15552422404289246, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1005.0803833007812, + "completions/mean_terminated_length": 821.6798095703125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.42256672526770017, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1237000475195039, + "kl": 0.018402099609375, + "learning_rate": 7.441050864535161e-07, + "loss": 0.051, + "num_tokens": 1146110611.0, + "reward": 1.4453126192092896, + "reward_std": 0.34513217210769653, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1748131364583969, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 991.560302734375, + "completions/mean_terminated_length": 815.4869995117188, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.42277981993500613, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11668856977376964, + "kl": 0.020263671875, + "learning_rate": 7.438030111852359e-07, + "loss": 0.1034, + "num_tokens": 1146622206.0, + "reward": 1.5318081378936768, + "reward_std": 0.31090766191482544, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1423770934343338, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1037.3929443359375, + "completions/mean_terminated_length": 810.97265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4229929146023121, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13429306115904246, + "kl": 0.017608642578125, + "learning_rate": 7.435008286618234e-07, + "loss": 0.1738, + "num_tokens": 1147164030.0, + "reward": 1.520647406578064, + "reward_std": 0.3790675103664398, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.18750248849391937, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 943.8683471679688, + "completions/mean_terminated_length": 756.4830322265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.42320600926961804, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13749421301061082, + "kl": 0.020660400390625, + "learning_rate": 7.431985390505134e-07, + "loss": 0.055, + "num_tokens": 1147653939.0, + "reward": 1.4397321939468384, + "reward_std": 0.23235628008842468, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.13773420453071594, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1083.0045166015625, + "completions/mean_terminated_length": 873.2228393554688, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.423419103936924, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11362423124964294, + "kl": 0.0174560546875, + "learning_rate": 7.428961425186002e-07, + "loss": 0.0607, + "num_tokens": 1148203141.0, + "reward": 1.5239956378936768, + "reward_std": 0.29532772302627563, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.14691685140132904, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 990.4777221679688, + "completions/mean_terminated_length": 854.6246337890625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.4236321986042299, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11907821825803847, + "kl": 0.01910400390625, + "learning_rate": 7.425936392334368e-07, + "loss": 0.0932, + "num_tokens": 1148715003.0, + "reward": 1.6668527126312256, + "reward_std": 0.31848883628845215, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.45739173889160156, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14266617596149445, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1161.6451416015625, + "completions/mean_terminated_length": 926.2853393554688, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.42384529327153586, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.9161053668214424, + "kl": 0.032989501953125, + "learning_rate": 7.42291029362436e-07, + "loss": 0.059, + "num_tokens": 1149301356.0, + "reward": 1.3939732313156128, + "reward_std": 0.37136876583099365, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.17708362638950348, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1083.154052734375, + "completions/mean_terminated_length": 798.7196655273438, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4240583879388418, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12687526118672238, + "kl": 0.018646240234375, + "learning_rate": 7.419883130730691e-07, + "loss": 0.0219, + "num_tokens": 1149857345.0, + "reward": 1.387834906578064, + "reward_std": 0.28479573130607605, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.19084173440933228, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1039.2835693359375, + "completions/mean_terminated_length": 826.6351318359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4242714826061478, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13878997570498752, + "kl": 0.020416259765625, + "learning_rate": 7.416854905328664e-07, + "loss": 0.087, + "num_tokens": 1150388160.0, + "reward": 1.4168527126312256, + "reward_std": 0.3089248239994049, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.19362127780914307, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1129.7076416015625, + "completions/mean_terminated_length": 852.0842895507812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.42448457727345373, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11262305235215861, + "kl": 0.0171356201171875, + "learning_rate": 7.41382561909417e-07, + "loss": 0.0245, + "num_tokens": 1150957037.0, + "reward": 1.4352679252624512, + "reward_std": 0.3097078502178192, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.1884397715330124, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1129.60498046875, + "completions/mean_terminated_length": 923.84423828125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.4246976719407597, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12579420887970502, + "kl": 0.01806640625, + "learning_rate": 7.410795273703685e-07, + "loss": 0.0862, + "num_tokens": 1151539900.0, + "reward": 1.3694196939468384, + "reward_std": 0.3807280659675598, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.2142428457736969, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 987.8973388671875, + "completions/mean_terminated_length": 833.35546875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.42491076660806565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12536190974543424, + "kl": 0.02001953125, + "learning_rate": 7.407763870834275e-07, + "loss": 0.1066, + "num_tokens": 1152047118.0, + "reward": 1.618303656578064, + "reward_std": 0.36230963468551636, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1615753173828125, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 916.5558471679688, + "completions/mean_terminated_length": 748.289794921875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.4251238612753716, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14486461726569597, + "kl": 0.019439697265625, + "learning_rate": 7.40473141216359e-07, + "loss": 0.1041, + "num_tokens": 1152526279.0, + "reward": 1.481584906578064, + "reward_std": 0.3407151401042938, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.17107665538787842, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1089.665283203125, + "completions/mean_terminated_length": 852.0835571289062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4253369559426775, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1179002351996685, + "kl": 0.018524169921875, + "learning_rate": 7.401697899369863e-07, + "loss": 0.0471, + "num_tokens": 1153082705.0, + "reward": 1.4263393878936768, + "reward_std": 0.2941608428955078, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.17920349538326263, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1072.46875, + "completions/mean_terminated_length": 885.6648559570312, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.42555005060998347, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12089811651514756, + "kl": 0.018157958984375, + "learning_rate": 7.398663334131913e-07, + "loss": 0.0221, + "num_tokens": 1153643779.0, + "reward": 1.4955357313156128, + "reward_std": 0.3620457351207733, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.1853403002023697, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1070.0960693359375, + "completions/mean_terminated_length": 799.8489990234375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.4257631452772894, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12848578540188574, + "kl": 0.020416259765625, + "learning_rate": 7.395627718129136e-07, + "loss": 0.0874, + "num_tokens": 1154191758.0, + "reward": 1.4045759439468384, + "reward_std": 0.33130455017089844, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.15974830090999603, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 955.8638916015625, + "completions/mean_terminated_length": 757.0316772460938, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.4259762399445954, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13618537501844835, + "kl": 0.02056884765625, + "learning_rate": 7.392591053041516e-07, + "loss": 0.104, + "num_tokens": 1154686001.0, + "reward": 1.532366156578064, + "reward_std": 0.3016572892665863, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.16701211035251617, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1081.5357666015625, + "completions/mean_terminated_length": 824.9039306640625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.42618933461190134, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1258082510947054, + "kl": 0.01617431640625, + "learning_rate": 7.389553340549612e-07, + "loss": 0.0431, + "num_tokens": 1155245553.0, + "reward": 1.4079241752624512, + "reward_std": 0.2892843186855316, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1900748312473297, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1055.4442138671875, + "completions/mean_terminated_length": 849.4420166015625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4264024292792073, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12357926587036605, + "kl": 0.01995849609375, + "learning_rate": 7.386514582334569e-07, + "loss": 0.0335, + "num_tokens": 1155793176.0, + "reward": 1.4709821939468384, + "reward_std": 0.3795382082462311, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.19061346352100372, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1066.2098388671875, + "completions/mean_terminated_length": 812.48876953125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.42661552394651325, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12394146871752577, + "kl": 0.019683837890625, + "learning_rate": 7.383474780078104e-07, + "loss": 0.0991, + "num_tokens": 1156339782.0, + "reward": 1.4988839626312256, + "reward_std": 0.2770690321922302, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.194654181599617, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1065.4129638671875, + "completions/mean_terminated_length": 877.2579345703125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.4268286186138192, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12136924180244796, + "kl": 0.018798828125, + "learning_rate": 7.380433935462517e-07, + "loss": 0.0727, + "num_tokens": 1156892271.0, + "reward": 1.4603794813156128, + "reward_std": 0.30810728669166565, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.18297357857227325, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 940.41748046875, + "completions/mean_terminated_length": 749.0549926757812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4270417132811251, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1378721192781175, + "kl": 0.021697998046875, + "learning_rate": 7.377392050170679e-07, + "loss": 0.125, + "num_tokens": 1157385594.0, + "reward": 1.4860491752624512, + "reward_std": 0.32827243208885193, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19084827601909637, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1046.669677734375, + "completions/mean_terminated_length": 822.3278198242188, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.42725480794843107, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11664888552883391, + "kl": 0.020172119140625, + "learning_rate": 7.374349125886046e-07, + "loss": 0.0578, + "num_tokens": 1157920134.0, + "reward": 1.3532366752624512, + "reward_std": 0.3162634074687958, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19194407761096954, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1125.2098388671875, + "completions/mean_terminated_length": 870.1937255859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.427467902615737, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11551836673067618, + "kl": 0.0179443359375, + "learning_rate": 7.37130516429264e-07, + "loss": 0.0957, + "num_tokens": 1158495684.0, + "reward": 1.4168527126312256, + "reward_std": 0.3748857080936432, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.20008359849452972, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 1173.5379638671875, + "completions/mean_terminated_length": 941.336181640625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.427680997283043, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10907505663280373, + "kl": 0.01556396484375, + "learning_rate": 7.368260167075061e-07, + "loss": 0.0689, + "num_tokens": 1159089429.0, + "reward": 1.4090402126312256, + "reward_std": 0.36002999544143677, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.18208445608615875, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 1072.796875, + "completions/mean_terminated_length": 870.3961791992188, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.42789409195034894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12994434872577318, + "kl": 0.0189208984375, + "learning_rate": 7.365214135918485e-07, + "loss": 0.122, + "num_tokens": 1159641050.0, + "reward": 1.4391741752624512, + "reward_std": 0.41247203946113586, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.1966029405593872, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1002.7433471679688, + "completions/mean_terminated_length": 812.4459228515625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.4281071866176549, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13513243794580246, + "kl": 0.020660400390625, + "learning_rate": 7.362167072508652e-07, + "loss": 0.0955, + "num_tokens": 1160163303.0, + "reward": 1.4235491752624512, + "reward_std": 0.3787241578102112, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9079241156578064, + "rewards/tag_count_reward/std": 0.2347799688577652, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 975.7210083007812, + "completions/mean_terminated_length": 803.4896240234375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.42832028128496086, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13040082486508398, + "kl": 0.020111083984375, + "learning_rate": 7.359118978531883e-07, + "loss": 0.0823, + "num_tokens": 1160662954.0, + "reward": 1.5731027126312256, + "reward_std": 0.2644284963607788, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.14470793306827545, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1021.2723388671875, + "completions/mean_terminated_length": 811.5107421875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.4285333759522668, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12310608045039467, + "kl": 0.018951416015625, + "learning_rate": 7.356069855675061e-07, + "loss": 0.0584, + "num_tokens": 1161185492.0, + "reward": 1.481584906578064, + "reward_std": 0.3446776270866394, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1390950083732605, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1105.2388916015625, + "completions/mean_terminated_length": 854.901123046875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4287464706195728, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11552040829679966, + "kl": 0.016998291015625, + "learning_rate": 7.353019705625645e-07, + "loss": 0.0958, + "num_tokens": 1161753599.0, + "reward": 1.4129464626312256, + "reward_std": 0.34539273381233215, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.20407520234584808, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1007.8348388671875, + "completions/mean_terminated_length": 785.1436767578125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4289595652868787, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12037030021667264, + "kl": 0.017822265625, + "learning_rate": 7.349968530071658e-07, + "loss": 0.073, + "num_tokens": 1162271909.0, + "reward": 1.5195313692092896, + "reward_std": 0.2649814486503601, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.15725943446159363, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1021.3460083007812, + "completions/mean_terminated_length": 814.9142456054688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.42917265995418463, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1220540121741869, + "kl": 0.017608642578125, + "learning_rate": 7.346916330701693e-07, + "loss": 0.0484, + "num_tokens": 1162808448.0, + "reward": 1.4414063692092896, + "reward_std": 0.37261244654655457, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19942738115787506, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1046.88623046875, + "completions/mean_terminated_length": 842.3575439453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4293857546214906, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12212174750647749, + "kl": 0.01708984375, + "learning_rate": 7.343863109204909e-07, + "loss": 0.0924, + "num_tokens": 1163341693.0, + "reward": 1.3945313692092896, + "reward_std": 0.302133709192276, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.17378443479537964, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1000.4777221679688, + "completions/mean_terminated_length": 799.8882446289062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.42959884928879655, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1277072093180643, + "kl": 0.01904296875, + "learning_rate": 7.34080886727103e-07, + "loss": 0.0691, + "num_tokens": 1163856915.0, + "reward": 1.5680804252624512, + "reward_std": 0.2932305932044983, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1540725976228714, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1050.8773193359375, + "completions/mean_terminated_length": 850.3834228515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4298119439561025, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12653991887985958, + "kl": 0.018463134765625, + "learning_rate": 7.337753606590344e-07, + "loss": 0.1054, + "num_tokens": 1164400988.0, + "reward": 1.4748884439468384, + "reward_std": 0.295178085565567, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.17706511914730072, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 985.47998046875, + "completions/mean_terminated_length": 845.9570922851562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.43002503862340846, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11480390835647009, + "kl": 0.019256591796875, + "learning_rate": 7.334697328853706e-07, + "loss": 0.0401, + "num_tokens": 1164909043.0, + "reward": 1.4916294813156128, + "reward_std": 0.32447755336761475, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.14720547199249268, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 977.4107666015625, + "completions/mean_terminated_length": 815.0333862304688, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.4302381332907144, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13233413273450828, + "kl": 0.018798828125, + "learning_rate": 7.331640035752528e-07, + "loss": 0.0642, + "num_tokens": 1165410907.0, + "reward": 1.5005581378936768, + "reward_std": 0.3394905626773834, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.17984239757061005, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1026.341552734375, + "completions/mean_terminated_length": 852.9530029296875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.4304512279580204, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12585713938985338, + "kl": 0.0206298828125, + "learning_rate": 7.328581728978792e-07, + "loss": 0.0762, + "num_tokens": 1165940420.0, + "reward": 1.5106027126312256, + "reward_std": 0.3467922508716583, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.17637281119823456, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1086.71875, + "completions/mean_terminated_length": 788.7777709960938, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.4306643226253263, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.127654545678264, + "kl": 0.017822265625, + "learning_rate": 7.325522410225035e-07, + "loss": 0.0884, + "num_tokens": 1166499094.0, + "reward": 1.4882813692092896, + "reward_std": 0.38874325156211853, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.20279188454151154, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1050.25, + "completions/mean_terminated_length": 871.705322265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.43087741729263224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12945923712176072, + "kl": 0.021331787109375, + "learning_rate": 7.322462081184355e-07, + "loss": 0.0495, + "num_tokens": 1167042374.0, + "reward": 1.5295759439468384, + "reward_std": 0.3477681577205658, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.1743152141571045, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 996.950927734375, + "completions/mean_terminated_length": 828.1295166015625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.4310905119599382, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1600873446032268, + "kl": 0.020782470703125, + "learning_rate": 7.319400743550411e-07, + "loss": 0.0755, + "num_tokens": 1167562304.0, + "reward": 1.4414063692092896, + "reward_std": 0.30521509051322937, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9280133843421936, + "rewards/tag_count_reward/std": 0.20611964166164398, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1025.805908203125, + "completions/mean_terminated_length": 806.9620971679688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.43130360662724415, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14637321611767606, + "kl": 0.02008056640625, + "learning_rate": 7.316338399017419e-07, + "loss": 0.071, + "num_tokens": 1168104873.0, + "reward": 1.4910714626312256, + "reward_std": 0.25788840651512146, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.1845843642950058, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1034.263427734375, + "completions/mean_terminated_length": 833.6845092773438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.4315167012945501, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12195401875072376, + "kl": 0.018585205078125, + "learning_rate": 7.313275049280152e-07, + "loss": 0.1058, + "num_tokens": 1168633519.0, + "reward": 1.528459906578064, + "reward_std": 0.32592836022377014, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.18088066577911377, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 987.1563110351562, + "completions/mean_terminated_length": 797.3211059570312, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.43172979596185607, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13299690038928727, + "kl": 0.02044677734375, + "learning_rate": 7.310210696033939e-07, + "loss": 0.0589, + "num_tokens": 1169144405.0, + "reward": 1.4235491752624512, + "reward_std": 0.29363900423049927, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18336887657642365, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1074.5692138671875, + "completions/mean_terminated_length": 888.1675415039062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.431942890629162, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12506611955202673, + "kl": 0.02008056640625, + "learning_rate": 7.307145340974666e-07, + "loss": 0.0783, + "num_tokens": 1169698132.0, + "reward": 1.4843751192092896, + "reward_std": 0.34263327717781067, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.16360238194465637, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 974.5491333007812, + "completions/mean_terminated_length": 782.4579467773438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.432155985296468, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1376999212339923, + "kl": 0.02020263671875, + "learning_rate": 7.304078985798773e-07, + "loss": 0.0921, + "num_tokens": 1170198106.0, + "reward": 1.4592634439468384, + "reward_std": 0.34764721989631653, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.1878284513950348, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1182.671875, + "completions/mean_terminated_length": 858.83740234375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4323690799637739, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12228657128802851, + "kl": 0.018035888671875, + "learning_rate": 7.30101163220325e-07, + "loss": 0.0664, + "num_tokens": 1170800055.0, + "reward": 1.328125, + "reward_std": 0.3932945430278778, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.24831648170948029, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1031.10498046875, + "completions/mean_terminated_length": 849.13427734375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.43258217463107984, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1395244118199411, + "kl": 0.021942138671875, + "learning_rate": 7.297943281885644e-07, + "loss": 0.0862, + "num_tokens": 1171322118.0, + "reward": 1.4397321939468384, + "reward_std": 0.25645577907562256, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.19568081200122833, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1008.1495971679688, + "completions/mean_terminated_length": 809.0292358398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4327952692983858, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13529758386248378, + "kl": 0.020904541015625, + "learning_rate": 7.294873936544054e-07, + "loss": 0.0859, + "num_tokens": 1171848009.0, + "reward": 1.547991156578064, + "reward_std": 0.2893456518650055, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.4822678565979004, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.2177339345216751, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1153.493408203125, + "completions/mean_terminated_length": 889.7947998046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.43300836396569176, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.2427688975096126, + "kl": 0.04595947265625, + "learning_rate": 7.291803597877126e-07, + "loss": 0.1224, + "num_tokens": 1172440262.0, + "reward": 1.3459821939468384, + "reward_std": 0.34488779306411743, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9040178656578064, + "rewards/tag_count_reward/std": 0.23049293458461761, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1082.28125, + "completions/mean_terminated_length": 797.589599609375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4332214586329977, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13455096043180642, + "kl": 0.018768310546875, + "learning_rate": 7.288732267584058e-07, + "loss": 0.12, + "num_tokens": 1172989508.0, + "reward": 1.4983259439468384, + "reward_std": 0.3553817570209503, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9112723469734192, + "rewards/tag_count_reward/std": 0.22639364004135132, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1058.5223388671875, + "completions/mean_terminated_length": 859.5657348632812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.43343455330030367, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.4201079099197848, + "kl": 0.04888916015625, + "learning_rate": 7.285659947364592e-07, + "loss": 0.0563, + "num_tokens": 1173540046.0, + "reward": 1.4916294813156128, + "reward_std": 0.34383928775787354, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.20298877358436584, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 984.575927734375, + "completions/mean_terminated_length": 813.766845703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.43364764796760963, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14342454283568976, + "kl": 0.020294189453125, + "learning_rate": 7.28258663891903e-07, + "loss": 0.0836, + "num_tokens": 1174047920.0, + "reward": 1.4642857313156128, + "reward_std": 0.3623078763484955, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.18918029963970184, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1103.93310546875, + "completions/mean_terminated_length": 879.6519775390625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.4338607426349156, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12177834845138455, + "kl": 0.016693115234375, + "learning_rate": 7.279512343948207e-07, + "loss": 0.0591, + "num_tokens": 1174617410.0, + "reward": 1.4838169813156128, + "reward_std": 0.3620666265487671, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.19157950580120087, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1016.2745971679688, + "completions/mean_terminated_length": 821.9708251953125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.4340738373022215, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12778408639132097, + "kl": 0.020233154296875, + "learning_rate": 7.276437064153513e-07, + "loss": 0.0931, + "num_tokens": 1175146653.0, + "reward": 1.5842634439468384, + "reward_std": 0.3446699380874634, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.1929689645767212, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1051.3348388671875, + "completions/mean_terminated_length": 831.3623657226562, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.43428693196952745, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1240360589442704, + "kl": 0.019622802734375, + "learning_rate": 7.273360801236876e-07, + "loss": 0.1176, + "num_tokens": 1175677027.0, + "reward": 1.5641741752624512, + "reward_std": 0.3642991781234741, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.4810029864311218, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.92578125, + "rewards/tag_count_reward/std": 0.21464671194553375, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1067.19873046875, + "completions/mean_terminated_length": 824.04736328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4345000266368334, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1220292390746561, + "kl": 0.017547607421875, + "learning_rate": 7.270283556900776e-07, + "loss": 0.0788, + "num_tokens": 1176215644.0, + "reward": 1.3783482313156128, + "reward_std": 0.3106710612773895, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.18725347518920898, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 961.9308471679688, + "completions/mean_terminated_length": 840.6575317382812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.43471312130413936, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12516932710271833, + "kl": 0.02056884765625, + "learning_rate": 7.267205332848231e-07, + "loss": 0.0519, + "num_tokens": 1176711229.0, + "reward": 1.6467634439468384, + "reward_std": 0.3304532766342163, + "rewards/accuracy_reward/mean": 0.6919642686843872, + "rewards/accuracy_reward/std": 0.46219751238822937, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.1490427851676941, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1014.2522583007812, + "completions/mean_terminated_length": 819.567626953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4349262159714453, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12650198826089648, + "kl": 0.01873779296875, + "learning_rate": 7.264126130782803e-07, + "loss": 0.0512, + "num_tokens": 1177228654.0, + "reward": 1.4827009439468384, + "reward_std": 0.25383102893829346, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.15974830090999603, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1123.8638916015625, + "completions/mean_terminated_length": 865.105712890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4351393106387513, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12545230015232006, + "kl": 0.01751708984375, + "learning_rate": 7.261045952408593e-07, + "loss": 0.0794, + "num_tokens": 1177798209.0, + "reward": 1.4056919813156128, + "reward_std": 0.3215946555137634, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18260477483272552, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1020.07373046875, + "completions/mean_terminated_length": 823.2366943359375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.43535240530605723, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13670143417797545, + "kl": 0.018951416015625, + "learning_rate": 7.257964799430245e-07, + "loss": 0.0433, + "num_tokens": 1178327890.0, + "reward": 1.3950893878936768, + "reward_std": 0.2512245178222656, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1724584698677063, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 997.4933471679688, + "completions/mean_terminated_length": 789.6390380859375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4355654999733632, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12882352029113336, + "kl": 0.020172119140625, + "learning_rate": 7.254882673552942e-07, + "loss": 0.0843, + "num_tokens": 1178839279.0, + "reward": 1.5273438692092896, + "reward_std": 0.2648398280143738, + "rewards/accuracy_reward/mean": 0.6180555820465088, + "rewards/accuracy_reward/std": 0.48642635345458984, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.19038327038288116, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1101.024658203125, + "completions/mean_terminated_length": 907.5564575195312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4357785946406691, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13585145716587707, + "kl": 0.019927978515625, + "learning_rate": 7.251799576482403e-07, + "loss": 0.0966, + "num_tokens": 1179411290.0, + "reward": 1.4162946939468384, + "reward_std": 0.38530245423316956, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9207589030265808, + "rewards/tag_count_reward/std": 0.2189916968345642, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 932.2857666015625, + "completions/mean_terminated_length": 739.518310546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.43599168930797505, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1301197931461155, + "kl": 0.021453857421875, + "learning_rate": 7.248715509924888e-07, + "loss": 0.0364, + "num_tokens": 1179891946.0, + "reward": 1.481584906578064, + "reward_std": 0.2887004613876343, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.14997799694538116, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1151.149658203125, + "completions/mean_terminated_length": 880.0087280273438, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.436204783975281, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12083018928102943, + "kl": 0.0165557861328125, + "learning_rate": 7.24563047558719e-07, + "loss": 0.1142, + "num_tokens": 1180480109.0, + "reward": 1.4252232313156128, + "reward_std": 0.3353671431541443, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.2101718634366989, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1073.8460693359375, + "completions/mean_terminated_length": 849.041259765625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.43641787864258696, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1186508653277658, + "kl": 0.0177001953125, + "learning_rate": 7.242544475176642e-07, + "loss": 0.0815, + "num_tokens": 1181025640.0, + "reward": 1.5195313692092896, + "reward_std": 0.2967233657836914, + "rewards/accuracy_reward/mean": 0.5856481194496155, + "rewards/accuracy_reward/std": 0.49318093061447144, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.16569413244724274, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1011.0245971679688, + "completions/mean_terminated_length": 831.8612670898438, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4366309733098929, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1434211598673814, + "kl": 0.018829345703125, + "learning_rate": 7.239457510401106e-07, + "loss": 0.1329, + "num_tokens": 1181549443.0, + "reward": 1.4715402126312256, + "reward_std": 0.33091679215431213, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.19411711394786835, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 859.1295166015625, + "completions/mean_terminated_length": 729.6484985351562, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4368440679771989, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1208979635298343, + "kl": 0.0218505859375, + "learning_rate": 7.236369582968981e-07, + "loss": 0.071, + "num_tokens": 1181994749.0, + "reward": 1.6668527126312256, + "reward_std": 0.2530975043773651, + "rewards/accuracy_reward/mean": 0.7361111044883728, + "rewards/accuracy_reward/std": 0.4412507712841034, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.15224164724349976, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1162.71435546875, + "completions/mean_terminated_length": 874.6035766601562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.43705716264450484, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1178898340547743, + "kl": 0.0163116455078125, + "learning_rate": 7.233280694589202e-07, + "loss": 0.0827, + "num_tokens": 1182589277.0, + "reward": 1.4525669813156128, + "reward_std": 0.33481213450431824, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19445767998695374, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1042.4085693359375, + "completions/mean_terminated_length": 806.9393920898438, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.4372702573118108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13246008378885643, + "kl": 0.01922607421875, + "learning_rate": 7.230190846971229e-07, + "loss": 0.0997, + "num_tokens": 1183123860.0, + "reward": 1.4319196939468384, + "reward_std": 0.378562867641449, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.18263639509677887, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1119.8326416015625, + "completions/mean_terminated_length": 866.696044921875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4374833519791167, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13260994669542384, + "kl": 0.017608642578125, + "learning_rate": 7.227100041825057e-07, + "loss": 0.0733, + "num_tokens": 1183702585.0, + "reward": 1.3856027126312256, + "reward_std": 0.2696685194969177, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.19084173440933228, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1015.7879638671875, + "completions/mean_terminated_length": 818.1303100585938, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.43769644664642265, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1337741084175181, + "kl": 0.0203857421875, + "learning_rate": 7.22400828086121e-07, + "loss": 0.0901, + "num_tokens": 1184224266.0, + "reward": 1.4988839626312256, + "reward_std": 0.3660476505756378, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9386160969734192, + "rewards/tag_count_reward/std": 0.18955937027931213, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1102.04248046875, + "completions/mean_terminated_length": 857.5814819335938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4379095413137286, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12222667940818496, + "kl": 0.01629638671875, + "learning_rate": 7.220915565790742e-07, + "loss": 0.0558, + "num_tokens": 1184787341.0, + "reward": 1.5195313692092896, + "reward_std": 0.3279862403869629, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.13563236594200134, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 925.1652221679688, + "completions/mean_terminated_length": 799.78662109375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.43812263598103457, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14424101838351153, + "kl": 0.021728515625, + "learning_rate": 7.217821898325234e-07, + "loss": 0.0903, + "num_tokens": 1185267255.0, + "reward": 1.5457589626312256, + "reward_std": 0.2965417802333832, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.16277234256267548, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 949.169677734375, + "completions/mean_terminated_length": 808.0100708007812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4383357306483405, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13055713084509007, + "kl": 0.020538330078125, + "learning_rate": 7.21472728017679e-07, + "loss": 0.0686, + "num_tokens": 1185759203.0, + "reward": 1.6322544813156128, + "reward_std": 0.2778674066066742, + "rewards/accuracy_reward/mean": 0.6741071343421936, + "rewards/accuracy_reward/std": 0.4692314565181732, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.15070870518684387, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1055.7857666015625, + "completions/mean_terminated_length": 830.158935546875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.4385488253156465, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11103727218472582, + "kl": 0.017120361328125, + "learning_rate": 7.211631713058049e-07, + "loss": 0.0714, + "num_tokens": 1186300243.0, + "reward": 1.5145089626312256, + "reward_std": 0.3523017466068268, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.17146752774715424, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 898.9285888671875, + "completions/mean_terminated_length": 780.05908203125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.43876191998295244, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12508904020978298, + "kl": 0.022247314453125, + "learning_rate": 7.20853519868217e-07, + "loss": 0.0208, + "num_tokens": 1186773507.0, + "reward": 1.5719866752624512, + "reward_std": 0.3009365499019623, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.14503538608551025, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1048.169677734375, + "completions/mean_terminated_length": 856.7127685546875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4389750146502584, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1294436413958662, + "kl": 0.02056884765625, + "learning_rate": 7.205437738762835e-07, + "loss": 0.0582, + "num_tokens": 1187315263.0, + "reward": 1.4575893878936768, + "reward_std": 0.3215850591659546, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.1791059374809265, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1130.388427734375, + "completions/mean_terminated_length": 859.8786010742188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4391881093175643, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12964819241148098, + "kl": 0.0167236328125, + "learning_rate": 7.202339335014253e-07, + "loss": 0.074, + "num_tokens": 1187893517.0, + "reward": 1.4531251192092896, + "reward_std": 0.3764910101890564, + "rewards/accuracy_reward/mean": 0.5162037014961243, + "rewards/accuracy_reward/std": 0.5003167986869812, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.1762816607952118, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 946.72998046875, + "completions/mean_terminated_length": 782.9512939453125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.43940120398487026, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11200487028017626, + "kl": 0.0183563232421875, + "learning_rate": 7.199239989151151e-07, + "loss": 0.0422, + "num_tokens": 1188385524.0, + "reward": 1.575334906578064, + "reward_std": 0.27904102206230164, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.1304718255996704, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 975.0670166015625, + "completions/mean_terminated_length": 809.1494750976562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4396142986521762, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11965050224347294, + "kl": 0.018798828125, + "learning_rate": 7.196139702888781e-07, + "loss": 0.0424, + "num_tokens": 1188890066.0, + "reward": 1.6082589626312256, + "reward_std": 0.376095712184906, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15465489029884338, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 965.8504638671875, + "completions/mean_terminated_length": 775.5512084960938, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.4398273933194822, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1319501482628102, + "kl": 0.01934814453125, + "learning_rate": 7.193038477942912e-07, + "loss": 0.0772, + "num_tokens": 1189387231.0, + "reward": 1.5820313692092896, + "reward_std": 0.29898250102996826, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.1453535109758377, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 996.5826416015625, + "completions/mean_terminated_length": 805.1636352539062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.44004048798678813, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13854250302472904, + "kl": 0.019134521484375, + "learning_rate": 7.189936316029839e-07, + "loss": 0.0832, + "num_tokens": 1189901076.0, + "reward": 1.532366156578064, + "reward_std": 0.2984950542449951, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.17314842343330383, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1034.0045166015625, + "completions/mean_terminated_length": 839.8350830078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4402535826540941, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13100043858490118, + "kl": 0.018463134765625, + "learning_rate": 7.186833218866367e-07, + "loss": 0.0813, + "num_tokens": 1190434566.0, + "reward": 1.5273438692092896, + "reward_std": 0.3369886577129364, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.19120772182941437, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 925.1719360351562, + "completions/mean_terminated_length": 784.113037109375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.44046667732140005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14068999215594574, + "kl": 0.019775390625, + "learning_rate": 7.183729188169825e-07, + "loss": 0.0508, + "num_tokens": 1190917651.0, + "reward": 1.5452009439468384, + "reward_std": 0.3048968017101288, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.14560237526893616, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1011.5201416015625, + "completions/mean_terminated_length": 857.376953125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.440679771988706, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12621636009318526, + "kl": 0.01812744140625, + "learning_rate": 7.180624225658057e-07, + "loss": 0.0823, + "num_tokens": 1191445756.0, + "reward": 1.4743304252624512, + "reward_std": 0.34153011441230774, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.18195155262947083, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 935.3772583007812, + "completions/mean_terminated_length": 792.44580078125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.4408928666560119, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15364702306585823, + "kl": 0.02020263671875, + "learning_rate": 7.17751833304942e-07, + "loss": 0.0613, + "num_tokens": 1191928821.0, + "reward": 1.5976563692092896, + "reward_std": 0.2924443781375885, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.16456767916679382, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 920.247802734375, + "completions/mean_terminated_length": 797.4232788085938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.44110596132331786, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13565063109202755, + "kl": 0.022186279296875, + "learning_rate": 7.174411512062789e-07, + "loss": 0.0989, + "num_tokens": 1192404868.0, + "reward": 1.6171876192092896, + "reward_std": 0.2753956615924835, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.470055490732193, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.16662296652793884, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 966.79248046875, + "completions/mean_terminated_length": 809.1738891601562, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4413190559906238, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13827367269633276, + "kl": 0.020416259765625, + "learning_rate": 7.171303764417552e-07, + "loss": 0.1372, + "num_tokens": 1192908455.0, + "reward": 1.4977679252624512, + "reward_std": 0.39043644070625305, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.18991795182228088, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1043.8304443359375, + "completions/mean_terminated_length": 801.8282470703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4415321506579298, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12186395133538762, + "kl": 0.018402099609375, + "learning_rate": 7.168195091833605e-07, + "loss": 0.0971, + "num_tokens": 1193438331.0, + "reward": 1.477678656578064, + "reward_std": 0.347394198179245, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.1853403002023697, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 985.7857666015625, + "completions/mean_terminated_length": 733.4364624023438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.44174524532523574, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12946785140580697, + "kl": 0.019256591796875, + "learning_rate": 7.165085496031368e-07, + "loss": 0.0146, + "num_tokens": 1193949067.0, + "reward": 1.5306919813156128, + "reward_std": 0.270818829536438, + "rewards/accuracy_reward/mean": 0.5949074029922485, + "rewards/accuracy_reward/std": 0.49147912859916687, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.1540675312280655, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 993.654052734375, + "completions/mean_terminated_length": 801.701904296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4419583399925417, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13845541621618165, + "kl": 0.0198974609375, + "learning_rate": 7.161974978731759e-07, + "loss": 0.0772, + "num_tokens": 1194456992.0, + "reward": 1.5340402126312256, + "reward_std": 0.33164268732070923, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.19119465351104736, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1040.1763916015625, + "completions/mean_terminated_length": 850.3739624023438, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.44217143465984765, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11387679921948027, + "kl": 0.019378662109375, + "learning_rate": 7.158863541656214e-07, + "loss": 0.0344, + "num_tokens": 1194986127.0, + "reward": 1.4793527126312256, + "reward_std": 0.3044973909854889, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.15676647424697876, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 941.6741333007812, + "completions/mean_terminated_length": 818.1389770507812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4423845293271536, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13489095553891556, + "kl": 0.021697998046875, + "learning_rate": 7.155751186526673e-07, + "loss": 0.0716, + "num_tokens": 1195478269.0, + "reward": 1.6082589626312256, + "reward_std": 0.3194030523300171, + "rewards/accuracy_reward/mean": 0.6651785969734192, + "rewards/accuracy_reward/std": 0.47245559096336365, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.18955937027931213, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 994.5938110351562, + "completions/mean_terminated_length": 847.1704711914062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.44259762399445957, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.3116568397646185, + "kl": 0.022247314453125, + "learning_rate": 7.152637915065585e-07, + "loss": 0.0251, + "num_tokens": 1195998231.0, + "reward": 1.4821429252624512, + "reward_std": 0.25211286544799805, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.158249631524086, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1002.747802734375, + "completions/mean_terminated_length": 837.9922485351562, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.44281071866176547, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13330991813049012, + "kl": 0.019134521484375, + "learning_rate": 7.149523728995913e-07, + "loss": 0.0509, + "num_tokens": 1196514918.0, + "reward": 1.5418527126312256, + "reward_std": 0.2718406915664673, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.15814605355262756, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1001.638427734375, + "completions/mean_terminated_length": 767.2076416015625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.4430238133290714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13489759213789615, + "kl": 0.020843505859375, + "learning_rate": 7.146408630041116e-07, + "loss": 0.0322, + "num_tokens": 1197031412.0, + "reward": 1.5669643878936768, + "reward_std": 0.2913854718208313, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.14875037968158722, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1172.4554443359375, + "completions/mean_terminated_length": 887.5148315429688, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4432369079963774, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11855766035925569, + "kl": 0.0160980224609375, + "learning_rate": 7.143292619925164e-07, + "loss": 0.063, + "num_tokens": 1197630752.0, + "reward": 1.3158482313156128, + "reward_std": 0.3204297423362732, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.21593762934207916, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 992.2410888671875, + "completions/mean_terminated_length": 853.6060791015625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.44345000266368334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1371884321618812, + "kl": 0.02081298828125, + "learning_rate": 7.14017570037253e-07, + "loss": 0.0566, + "num_tokens": 1198139740.0, + "reward": 1.473772406578064, + "reward_std": 0.32192912697792053, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.16576191782951355, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 1110.9910888671875, + "completions/mean_terminated_length": 901.06005859375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4436630973309893, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1288664620879469, + "kl": 0.0181884765625, + "learning_rate": 7.137057873108192e-07, + "loss": 0.0753, + "num_tokens": 1198710408.0, + "reward": 1.5033482313156128, + "reward_std": 0.33159375190734863, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19537115097045898, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1164.009033203125, + "completions/mean_terminated_length": 1003.0712890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.44387619199829526, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11183260971800611, + "kl": 0.015350341796875, + "learning_rate": 7.133939139857625e-07, + "loss": 0.0564, + "num_tokens": 1199306652.0, + "reward": 1.4910714626312256, + "reward_std": 0.3213631212711334, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.17503081262111664, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1123.4442138671875, + "completions/mean_terminated_length": 894.2367553710938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4440892866656012, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12110334897709045, + "kl": 0.0189208984375, + "learning_rate": 7.130819502346813e-07, + "loss": 0.0564, + "num_tokens": 1199877347.0, + "reward": 1.4754464626312256, + "reward_std": 0.3657534718513489, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8973214030265808, + "rewards/tag_count_reward/std": 0.24532216787338257, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1111.03125, + "completions/mean_terminated_length": 907.3424072265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.44430238133290717, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1268828164329816, + "kl": 0.018157958984375, + "learning_rate": 7.127698962302234e-07, + "loss": 0.0879, + "num_tokens": 1200448241.0, + "reward": 1.4084821939468384, + "reward_std": 0.2666908800601959, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.17030231654644012, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1065.0023193359375, + "completions/mean_terminated_length": 831.472412109375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.4445154760002131, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13311921114544228, + "kl": 0.019378662109375, + "learning_rate": 7.124577521450871e-07, + "loss": 0.0708, + "num_tokens": 1200985794.0, + "reward": 1.4838169813156128, + "reward_std": 0.3126163184642792, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9280133843421936, + "rewards/tag_count_reward/std": 0.21734584867954254, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1008.7076416015625, + "completions/mean_terminated_length": 803.0722045898438, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.44472857066751903, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1314362050223909, + "kl": 0.019500732421875, + "learning_rate": 7.121455181520199e-07, + "loss": 0.0777, + "num_tokens": 1201505615.0, + "reward": 1.5044643878936768, + "reward_std": 0.3508971035480499, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.008928571827709675, + "rewards/format_reward/std": 0.09417349100112915, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.17533011734485626, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1229.3773193359375, + "completions/mean_terminated_length": 909.046630859375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.444941665334825, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11436702343252982, + "kl": 0.015777587890625, + "learning_rate": 7.118331944238196e-07, + "loss": 0.0475, + "num_tokens": 1202123672.0, + "reward": 1.3694196939468384, + "reward_std": 0.33544400334358215, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.23562191426753998, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1111.9129638671875, + "completions/mean_terminated_length": 908.415771484375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.44515476000213094, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11709825199760196, + "kl": 0.018524169921875, + "learning_rate": 7.115207811333335e-07, + "loss": 0.0724, + "num_tokens": 1202689249.0, + "reward": 1.4263393878936768, + "reward_std": 0.3321261405944824, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.17612579464912415, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1120.2098388671875, + "completions/mean_terminated_length": 853.6034545898438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4453678546694369, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13520223058296615, + "kl": 0.0177001953125, + "learning_rate": 7.112082784534585e-07, + "loss": 0.105, + "num_tokens": 1203266719.0, + "reward": 1.3286831378936768, + "reward_std": 0.3107526898384094, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.22645428776741028, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1009.966552734375, + "completions/mean_terminated_length": 864.6946411132812, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.44558094933674286, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11838570950087995, + "kl": 0.018798828125, + "learning_rate": 7.108956865571408e-07, + "loss": 0.0713, + "num_tokens": 1203787872.0, + "reward": 1.5976563692092896, + "reward_std": 0.3133440613746643, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.17458714544773102, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1026.93310546875, + "completions/mean_terminated_length": 831.4095458984375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.4457940440040488, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1483449647535498, + "kl": 0.0191650390625, + "learning_rate": 7.10583005617376e-07, + "loss": 0.1028, + "num_tokens": 1204314178.0, + "reward": 1.5100446939468384, + "reward_std": 0.28503119945526123, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.19173339009284973, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1094.024658203125, + "completions/mean_terminated_length": 899.1263427734375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.4460071386713548, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.5877693997123878, + "kl": 0.03076171875, + "learning_rate": 7.10270235807209e-07, + "loss": 0.0881, + "num_tokens": 1204880909.0, + "reward": 1.4670759439468384, + "reward_std": 0.3199712038040161, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.14945265650749207, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 984.7656860351562, + "completions/mean_terminated_length": 801.0654296875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.4462202333386607, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12127665779475828, + "kl": 0.018402099609375, + "learning_rate": 7.099573772997344e-07, + "loss": 0.0552, + "num_tokens": 1205392820.0, + "reward": 1.5373884439468384, + "reward_std": 0.3277756869792938, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.158248633146286, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1054.83935546875, + "completions/mean_terminated_length": 815.4902954101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.44643332800596663, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12048358979440574, + "kl": 0.0168914794921875, + "learning_rate": 7.096444302680951e-07, + "loss": 0.0474, + "num_tokens": 1205938412.0, + "reward": 1.454241156578064, + "reward_std": 0.337785929441452, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.16701209545135498, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1009.5870971679688, + "completions/mean_terminated_length": 790.6784057617188, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4466464226732726, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12170217902487429, + "kl": 0.0174560546875, + "learning_rate": 7.093313948854834e-07, + "loss": 0.0649, + "num_tokens": 1206455779.0, + "reward": 1.5697544813156128, + "reward_std": 0.3142164945602417, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.17088682949543, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1025.5960693359375, + "completions/mean_terminated_length": 826.5679931640625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.44685951734057855, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12604881065521792, + "kl": 0.018798828125, + "learning_rate": 7.090182713251404e-07, + "loss": 0.06, + "num_tokens": 1206979406.0, + "reward": 1.4720982313156128, + "reward_std": 0.3795251250267029, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1795479953289032, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 954.6942138671875, + "completions/mean_terminated_length": 759.050048828125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4470726120078845, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14578435836525083, + "kl": 0.021575927734375, + "learning_rate": 7.08705059760356e-07, + "loss": 0.1025, + "num_tokens": 1207480229.0, + "reward": 1.540178656578064, + "reward_std": 0.35240188241004944, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.008928571827709675, + "rewards/format_reward/std": 0.09417349100112915, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.15763309597969055, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 980.5022583007812, + "completions/mean_terminated_length": 805.8207397460938, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.44728570667519046, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13672660625571847, + "kl": 0.018768310546875, + "learning_rate": 7.083917603644688e-07, + "loss": 0.0618, + "num_tokens": 1207989734.0, + "reward": 1.5703126192092896, + "reward_std": 0.2972128987312317, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.17375299334526062, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1044.1629638671875, + "completions/mean_terminated_length": 839.0779418945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4474988013424964, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.39424230950159644, + "kl": 0.0173797607421875, + "learning_rate": 7.08078373310866e-07, + "loss": 0.0942, + "num_tokens": 1208530639.0, + "reward": 1.3482143878936768, + "reward_std": 0.3589441180229187, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9241071343421936, + "rewards/tag_count_reward/std": 0.23192918300628662, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 906.9933471679688, + "completions/mean_terminated_length": 753.8961791992188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4477118960098024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7440586227927102, + "kl": 0.053009033203125, + "learning_rate": 7.077648987729837e-07, + "loss": 0.1303, + "num_tokens": 1209006060.0, + "reward": 1.4843751192092896, + "reward_std": 0.377036988735199, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.23015688359737396, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 993.97998046875, + "completions/mean_terminated_length": 805.3658447265625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.4479249906771083, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13184477168399156, + "kl": 0.019744873046875, + "learning_rate": 7.074513369243056e-07, + "loss": 0.0945, + "num_tokens": 1209524547.0, + "reward": 1.5234376192092896, + "reward_std": 0.41563335061073303, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9252232313156128, + "rewards/tag_count_reward/std": 0.21083608269691467, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 925.263427734375, + "completions/mean_terminated_length": 777.8333129882812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.44813808534441424, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1505201871754739, + "kl": 0.020660400390625, + "learning_rate": 7.071376879383647e-07, + "loss": 0.0833, + "num_tokens": 1210007881.0, + "reward": 1.4760044813156128, + "reward_std": 0.31187623739242554, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995505809783936, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.1533121019601822, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1011.9844360351562, + "completions/mean_terminated_length": 836.1593017578125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.4483511800117202, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13061578238488283, + "kl": 0.020660400390625, + "learning_rate": 7.068239519887411e-07, + "loss": 0.0705, + "num_tokens": 1210530258.0, + "reward": 1.5468751192092896, + "reward_std": 0.3386935591697693, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.18765640258789062, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1115.29248046875, + "completions/mean_terminated_length": 896.8898315429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.44856427467902615, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12882815987970697, + "kl": 0.018402099609375, + "learning_rate": 7.065101292490639e-07, + "loss": 0.0733, + "num_tokens": 1211098053.0, + "reward": 1.4715402126312256, + "reward_std": 0.39873239398002625, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9157366156578064, + "rewards/tag_count_reward/std": 0.22931937873363495, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1096.5692138671875, + "completions/mean_terminated_length": 911.3572998046875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4487773693463321, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1192902968973232, + "kl": 0.018310546875, + "learning_rate": 7.061962198930102e-07, + "loss": 0.0684, + "num_tokens": 1211661124.0, + "reward": 1.4771206378936768, + "reward_std": 0.3917034864425659, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9056919813156128, + "rewards/tag_count_reward/std": 0.23448732495307922, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1025.53125, + "completions/mean_terminated_length": 809.9838256835938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.44899046401363807, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13020058997101633, + "kl": 0.018798828125, + "learning_rate": 7.058822240943044e-07, + "loss": 0.1142, + "num_tokens": 1212191330.0, + "reward": 1.4492188692092896, + "reward_std": 0.34832221269607544, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9157366156578064, + "rewards/tag_count_reward/std": 0.22931937873363495, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 949.997802734375, + "completions/mean_terminated_length": 760.2905883789062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.449203558680944, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14340437507437326, + "kl": 0.021240234375, + "learning_rate": 7.055681420267196e-07, + "loss": 0.0867, + "num_tokens": 1212683425.0, + "reward": 1.4760044813156128, + "reward_std": 0.4063599705696106, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316954612732, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.23134104907512665, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1002.0402221679688, + "completions/mean_terminated_length": 798.4266357421875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.44941665334825, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13909452443393927, + "kl": 0.022216796875, + "learning_rate": 7.052539738640757e-07, + "loss": 0.0767, + "num_tokens": 1213200563.0, + "reward": 1.4614956378936768, + "reward_std": 0.30341097712516785, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.1856423318386078, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1066.9285888671875, + "completions/mean_terminated_length": 850.3978271484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4496297480155559, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.5242757140992893, + "kl": 0.019989013671875, + "learning_rate": 7.049397197802408e-07, + "loss": 0.133, + "num_tokens": 1213748995.0, + "reward": 1.3906251192092896, + "reward_std": 0.3241032063961029, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.1884000152349472, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1905.0, + "completions/mean_length": 867.1719360351562, + "completions/mean_terminated_length": 659.5196533203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.44984284268286184, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13811446746886555, + "kl": 0.020538330078125, + "learning_rate": 7.04625379949131e-07, + "loss": 0.1121, + "num_tokens": 1214212672.0, + "reward": 1.594866156578064, + "reward_std": 0.3024706244468689, + "rewards/accuracy_reward/mean": 0.6428571343421936, + "rewards/accuracy_reward/std": 0.47969305515289307, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.17132186889648438, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 975.3795166015625, + "completions/mean_terminated_length": 834.5302734375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4500559373501678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13469183827692344, + "kl": 0.020263671875, + "learning_rate": 7.043109545447087e-07, + "loss": 0.0941, + "num_tokens": 1214719066.0, + "reward": 1.6395089626312256, + "reward_std": 0.33141523599624634, + "rewards/accuracy_reward/mean": 0.7075892686843872, + "rewards/accuracy_reward/std": 0.4553784728050232, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.19385728240013123, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1035.6763916015625, + "completions/mean_terminated_length": 812.2479248046875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.45026903201747376, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12079807317986545, + "kl": 0.01953125, + "learning_rate": 7.039964437409844e-07, + "loss": 0.0777, + "num_tokens": 1215256905.0, + "reward": 1.5055804252624512, + "reward_std": 0.389077752828598, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.20431670546531677, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1115.609375, + "completions/mean_terminated_length": 931.1256713867188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4504821266847797, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12347592414109657, + "kl": 0.018707275390625, + "learning_rate": 7.036818477120163e-07, + "loss": 0.0731, + "num_tokens": 1215824042.0, + "reward": 1.4687501192092896, + "reward_std": 0.351925253868103, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.18240725994110107, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1060.665283203125, + "completions/mean_terminated_length": 862.1394653320312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4506952213520857, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1338831279336727, + "kl": 0.0203857421875, + "learning_rate": 7.033671666319085e-07, + "loss": 0.0839, + "num_tokens": 1216365668.0, + "reward": 1.4944196939468384, + "reward_std": 0.38135501742362976, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9341517686843872, + "rewards/tag_count_reward/std": 0.19959399104118347, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1099.071533203125, + "completions/mean_terminated_length": 867.1111450195312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.45090831601939163, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1436092920952251, + "kl": 0.020538330078125, + "learning_rate": 7.030524006748135e-07, + "loss": 0.101, + "num_tokens": 1216927588.0, + "reward": 1.4241071939468384, + "reward_std": 0.3675498366355896, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.20673725008964539, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1129.5491943359375, + "completions/mean_terminated_length": 911.3536376953125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4511214106866976, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12107033314166665, + "kl": 0.017791748046875, + "learning_rate": 7.027375500149297e-07, + "loss": 0.099, + "num_tokens": 1217505306.0, + "reward": 1.4051339626312256, + "reward_std": 0.4194040596485138, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9162946343421936, + "rewards/tag_count_reward/std": 0.21078869700431824, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1123.6875, + "completions/mean_terminated_length": 952.5184936523438, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4513345053540035, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1260000089118119, + "kl": 0.0186767578125, + "learning_rate": 7.024226148265032e-07, + "loss": 0.1062, + "num_tokens": 1218075294.0, + "reward": 1.4921876192092896, + "reward_std": 0.40922603011131287, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9095982313156128, + "rewards/tag_count_reward/std": 0.22355039417743683, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1088.6451416015625, + "completions/mean_terminated_length": 898.8262329101562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.45154760002130945, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11318952807575937, + "kl": 0.0194091796875, + "learning_rate": 7.021075952838262e-07, + "loss": 0.0176, + "num_tokens": 1218628735.0, + "reward": 1.5016741752624512, + "reward_std": 0.3029335141181946, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.18560869991779327, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1002.7701416015625, + "completions/mean_terminated_length": 792.603271484375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4517606946886154, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13267192512029619, + "kl": 0.0208740234375, + "learning_rate": 7.017924915612381e-07, + "loss": 0.0859, + "num_tokens": 1219143912.0, + "reward": 1.5273438692092896, + "reward_std": 0.33173060417175293, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9313616156578064, + "rewards/tag_count_reward/std": 0.1954566091299057, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1136.9263916015625, + "completions/mean_terminated_length": 907.8854370117188, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.45197378935592136, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.144570626548426, + "kl": 0.01812744140625, + "learning_rate": 7.014773038331247e-07, + "loss": 0.1206, + "num_tokens": 1219720903.0, + "reward": 1.493303656578064, + "reward_std": 0.42330852150917053, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9107142686843872, + "rewards/tag_count_reward/std": 0.23136872053146362, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1104.6004638671875, + "completions/mean_terminated_length": 877.2437133789062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4521868840232273, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1237456793686293, + "kl": 0.0191650390625, + "learning_rate": 7.011620322739183e-07, + "loss": 0.0869, + "num_tokens": 1220287428.0, + "reward": 1.4129464626312256, + "reward_std": 0.36484357714653015, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.2053801566362381, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1059.3148193359375, + "completions/mean_terminated_length": 847.64501953125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.4523999786905333, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1254572340069438, + "kl": 0.018341064453125, + "learning_rate": 7.008466770580972e-07, + "loss": 0.1142, + "num_tokens": 1220834225.0, + "reward": 1.4927456378936768, + "reward_std": 0.3333226144313812, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.1951945573091507, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1140.078125, + "completions/mean_terminated_length": 895.7365112304688, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.45261307335783924, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11762407525780962, + "kl": 0.016754150390625, + "learning_rate": 7.005312383601869e-07, + "loss": 0.1207, + "num_tokens": 1221415252.0, + "reward": 1.4040179252624512, + "reward_std": 0.3198819160461426, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.18765640258789062, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1041.7366943359375, + "completions/mean_terminated_length": 832.8894653320312, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.4528261680251452, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13560181056531387, + "kl": 0.02166748046875, + "learning_rate": 7.002157163547583e-07, + "loss": 0.0721, + "num_tokens": 1221946702.0, + "reward": 1.6021206378936768, + "reward_std": 0.32840245962142944, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.18103241920471191, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1144.743408203125, + "completions/mean_terminated_length": 881.835693359375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.4530392626924511, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12435096100083422, + "kl": 0.018402099609375, + "learning_rate": 6.999001112164288e-07, + "loss": 0.0844, + "num_tokens": 1222527899.0, + "reward": 1.4531251192092896, + "reward_std": 0.3638610243797302, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.19418221712112427, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 961.4553833007812, + "completions/mean_terminated_length": 828.02001953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.45325235735975705, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12710405916485282, + "kl": 0.02288818359375, + "learning_rate": 6.995844231198616e-07, + "loss": 0.0502, + "num_tokens": 1223028551.0, + "reward": 1.6623884439468384, + "reward_std": 0.35487285256385803, + "rewards/accuracy_reward/mean": 0.7075892686843872, + "rewards/accuracy_reward/std": 0.4553784728050232, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.16183683276176453, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1027.7991943359375, + "completions/mean_terminated_length": 802.6321411132812, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.453465452027063, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13059197610576262, + "kl": 0.021148681640625, + "learning_rate": 6.992686522397658e-07, + "loss": 0.1252, + "num_tokens": 1223560781.0, + "reward": 1.5122768878936768, + "reward_std": 0.40864914655685425, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.2057536393404007, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 960.9420166015625, + "completions/mean_terminated_length": 824.3768920898438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.45367854669436897, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1341443597428575, + "kl": 0.019683837890625, + "learning_rate": 6.989527987508966e-07, + "loss": 0.118, + "num_tokens": 1224068115.0, + "reward": 1.6434152126312256, + "reward_std": 0.2862638831138611, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13073945045471191, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1269.935302734375, + "completions/mean_terminated_length": 998.081298828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4538916413616749, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.09929585381278933, + "kl": 0.015869140625, + "learning_rate": 6.986368628280547e-07, + "loss": 0.0768, + "num_tokens": 1224708054.0, + "reward": 1.356584906578064, + "reward_std": 0.39402303099632263, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9302455186843872, + "rewards/tag_count_reward/std": 0.2075621336698532, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1052.9598388671875, + "completions/mean_terminated_length": 813.1578979492188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4541047360289809, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12388351827629222, + "kl": 0.019134521484375, + "learning_rate": 6.983208446460863e-07, + "loss": 0.0427, + "num_tokens": 1225247524.0, + "reward": 1.5000001192092896, + "reward_std": 0.3233374357223511, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12415824085474014, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.14943698048591614, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 996.8035888671875, + "completions/mean_terminated_length": 788.8128662109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.45431783069628684, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1281292900187781, + "kl": 0.021148681640625, + "learning_rate": 6.980047443798835e-07, + "loss": 0.0451, + "num_tokens": 1225768668.0, + "reward": 1.4436384439468384, + "reward_std": 0.2854624092578888, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.16228362917900085, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1016.5000610351562, + "completions/mean_terminated_length": 818.9786987304688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4545309253635928, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11896062847183478, + "kl": 0.018768310546875, + "learning_rate": 6.976885622043836e-07, + "loss": 0.0581, + "num_tokens": 1226292604.0, + "reward": 1.5212054252624512, + "reward_std": 0.3162801265716553, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1849188506603241, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1236.4241943359375, + "completions/mean_terminated_length": 901.041015625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4547440200308987, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11863223305708431, + "kl": 0.0163116455078125, + "learning_rate": 6.973722982945688e-07, + "loss": 0.0504, + "num_tokens": 1226918906.0, + "reward": 1.407366156578064, + "reward_std": 0.3814505934715271, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.18618372082710266, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1013.9375610351562, + "completions/mean_terminated_length": 815.9254760742188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.45495711469820466, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13058947363371667, + "kl": 0.0185546875, + "learning_rate": 6.970559528254674e-07, + "loss": 0.065, + "num_tokens": 1227445550.0, + "reward": 1.4938616752624512, + "reward_std": 0.3140023350715637, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.01116071455180645, + "rewards/format_reward/std": 0.10517053306102753, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.1503853052854538, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1101.4576416015625, + "completions/mean_terminated_length": 860.1820678710938, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.4551702093655106, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12247783944159751, + "kl": 0.017578125, + "learning_rate": 6.967395259721523e-07, + "loss": 0.1107, + "num_tokens": 1228011579.0, + "reward": 1.4481027126312256, + "reward_std": 0.3940381407737732, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1929948478937149, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1016.2410888671875, + "completions/mean_terminated_length": 837.9790649414062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.45538330403281657, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11448485578315898, + "kl": 0.01849365234375, + "learning_rate": 6.964230179097414e-07, + "loss": 0.0808, + "num_tokens": 1228528951.0, + "reward": 1.4720982313156128, + "reward_std": 0.26490485668182373, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15099525451660156, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1049.85498046875, + "completions/mean_terminated_length": 849.155517578125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.45559639870012253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13943064016708315, + "kl": 0.01898193359375, + "learning_rate": 6.96106428813398e-07, + "loss": 0.1359, + "num_tokens": 1229071862.0, + "reward": 1.540178656578064, + "reward_std": 0.3550258278846741, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.1970413774251938, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 948.7188110351562, + "completions/mean_terminated_length": 768.8363647460938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4558094933674285, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14212882734125562, + "kl": 0.022186279296875, + "learning_rate": 6.957897588583298e-07, + "loss": 0.082, + "num_tokens": 1229556872.0, + "reward": 1.6043527126312256, + "reward_std": 0.3201252520084381, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.17706511914730072, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1111.290283203125, + "completions/mean_terminated_length": 888.7569580078125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.45602258803473444, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12307676872354202, + "kl": 0.01739501953125, + "learning_rate": 6.954730082197891e-07, + "loss": 0.0911, + "num_tokens": 1230129178.0, + "reward": 1.5055804252624512, + "reward_std": 0.39122167229652405, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.4966535270214081, + "rewards/format_reward/mean": 0.013392857275903225, + "rewards/format_reward/std": 0.11507843434810638, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.17773102223873138, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 916.7098388671875, + "completions/mean_terminated_length": 689.2386474609375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4562356827020404, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1402390928030813, + "kl": 0.018890380859375, + "learning_rate": 6.951561770730736e-07, + "loss": 0.0944, + "num_tokens": 1230608680.0, + "reward": 1.5742188692092896, + "reward_std": 0.3198275566101074, + "rewards/accuracy_reward/mean": 0.6481481194496155, + "rewards/accuracy_reward/std": 0.4781017005443573, + "rewards/format_reward/mean": 0.008928571827709675, + "rewards/format_reward/std": 0.09417349100112915, + "rewards/tag_count_reward/mean": 0.9402901530265808, + "rewards/tag_count_reward/std": 0.18750248849391937, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1068.5848388671875, + "completions/mean_terminated_length": 871.6514892578125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.45644877736934636, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13112911467592722, + "kl": 0.0213623046875, + "learning_rate": 6.948392655935247e-07, + "loss": 0.0739, + "num_tokens": 1231155182.0, + "reward": 1.5725446939468384, + "reward_std": 0.3506115674972534, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.14909827709197998, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1123.7857666015625, + "completions/mean_terminated_length": 868.3760375976562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.45666187203665226, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.17295181095191361, + "kl": 0.019500732421875, + "learning_rate": 6.945222739565288e-07, + "loss": 0.0433, + "num_tokens": 1231731470.0, + "reward": 1.3364956378936768, + "reward_std": 0.3273019790649414, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.01116071455180645, + "rewards/format_reward/std": 0.10517053306102753, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.1753861904144287, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1133.52685546875, + "completions/mean_terminated_length": 887.4220581054688, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4568749667039582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12518814156220218, + "kl": 0.019012451171875, + "learning_rate": 6.942052023375166e-07, + "loss": 0.0843, + "num_tokens": 1232303114.0, + "reward": 1.4162946939468384, + "reward_std": 0.3659404516220093, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0223214291036129, + "rewards/format_reward/std": 0.14789186418056488, + "rewards/tag_count_reward/mean": 0.9319196343421936, + "rewards/tag_count_reward/std": 0.20024341344833374, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1081.477783203125, + "completions/mean_terminated_length": 810.8514404296875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4570880613712642, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11881452334494458, + "kl": 0.0169525146484375, + "learning_rate": 6.938880509119628e-07, + "loss": 0.1105, + "num_tokens": 1232857488.0, + "reward": 1.4196429252624512, + "reward_std": 0.3262912929058075, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.004464285913854837, + "rewards/format_reward/std": 0.06674052774906158, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.18540765345096588, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1039.9263916015625, + "completions/mean_terminated_length": 807.2940063476562, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.45730115603857013, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13260945796253176, + "kl": 0.019500732421875, + "learning_rate": 6.935708198553864e-07, + "loss": 0.0536, + "num_tokens": 1233394015.0, + "reward": 1.477678656578064, + "reward_std": 0.34392327070236206, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.17112135887145996, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 931.1875610351562, + "completions/mean_terminated_length": 741.650146484375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.4575142507058761, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14432476253453902, + "kl": 0.02081298828125, + "learning_rate": 6.932535093433509e-07, + "loss": 0.0704, + "num_tokens": 1233871363.0, + "reward": 1.6858259439468384, + "reward_std": 0.24119317531585693, + "rewards/accuracy_reward/mean": 0.7209821343421936, + "rewards/accuracy_reward/std": 0.449017733335495, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.13939984142780304, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1076.790283203125, + "completions/mean_terminated_length": 862.4359741210938, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.45772734537318205, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13118731474571235, + "kl": 0.017669677734375, + "learning_rate": 6.929361195514628e-07, + "loss": 0.0882, + "num_tokens": 1234418821.0, + "reward": 1.5937501192092896, + "reward_std": 0.3818773925304413, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.15736359357833862, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 978.607177734375, + "completions/mean_terminated_length": 850.2799682617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.457940440040488, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13992743132810606, + "kl": 0.02081298828125, + "learning_rate": 6.926186506553735e-07, + "loss": 0.0729, + "num_tokens": 1234923605.0, + "reward": 1.5312501192092896, + "reward_std": 0.29126328229904175, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.0022321429569274187, + "rewards/format_reward/std": 0.047245558351278305, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1632968634366989, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1027.290283203125, + "completions/mean_terminated_length": 818.758056640625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.45815353470779396, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12863829285221157, + "kl": 0.019805908203125, + "learning_rate": 6.923011028307776e-07, + "loss": 0.0959, + "num_tokens": 1235453863.0, + "reward": 1.555803656578064, + "reward_std": 0.3804105520248413, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.02008928544819355, + "rewards/format_reward/std": 0.14046262204647064, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.19212691485881805, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1130.33935546875, + "completions/mean_terminated_length": 899.6424560546875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.45836662937509987, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11561555312554109, + "kl": 0.016754150390625, + "learning_rate": 6.919834762534136e-07, + "loss": 0.0529, + "num_tokens": 1236035551.0, + "reward": 1.3789063692092896, + "reward_std": 0.31697165966033936, + "rewards/accuracy_reward/mean": 0.44212964177131653, + "rewards/accuracy_reward/std": 0.4972155690193176, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.1761815994977951, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1011.7991333007812, + "completions/mean_terminated_length": 848.4703369140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4585797240424058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12841370085015966, + "kl": 0.019378662109375, + "learning_rate": 6.916657710990632e-07, + "loss": 0.1079, + "num_tokens": 1236554213.0, + "reward": 1.587053656578064, + "reward_std": 0.3580479025840759, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.01116071455180645, + "rewards/format_reward/std": 0.10517053306102753, + "rewards/tag_count_reward/mean": 0.9308035969734192, + "rewards/tag_count_reward/std": 0.19490092992782593, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1083.49560546875, + "completions/mean_terminated_length": 806.3390502929688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4587928187097118, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1292043593074205, + "kl": 0.0205078125, + "learning_rate": 6.913479875435521e-07, + "loss": 0.0693, + "num_tokens": 1237117763.0, + "reward": 1.5000001192092896, + "reward_std": 0.32636529207229614, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12415824085474014, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.1814328134059906, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1092.915283203125, + "completions/mean_terminated_length": 849.4622192382812, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.45900591337701774, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11233016472911361, + "kl": 0.017730712890625, + "learning_rate": 6.910301257627493e-07, + "loss": 0.0547, + "num_tokens": 1237676317.0, + "reward": 1.5078126192092896, + "reward_std": 0.3548882305622101, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.0066964286379516125, + "rewards/format_reward/std": 0.08164843916893005, + "rewards/tag_count_reward/mean": 0.9274553656578064, + "rewards/tag_count_reward/std": 0.2035330981016159, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1227.32373046875, + "completions/mean_terminated_length": 923.6483764648438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4592190080443237, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11216898283127252, + "kl": 0.015960693359375, + "learning_rate": 6.907121859325666e-07, + "loss": 0.043, + "num_tokens": 1238294526.0, + "reward": 1.3510044813156128, + "reward_std": 0.40313515067100525, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.02901785634458065, + "rewards/format_reward/std": 0.16804419457912445, + "rewards/tag_count_reward/mean": 0.9268973469734192, + "rewards/tag_count_reward/std": 0.215030238032341, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1045.747802734375, + "completions/mean_terminated_length": 831.1734619140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.45943210271162965, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15084060297496396, + "kl": 0.0203857421875, + "learning_rate": 6.903941682289598e-07, + "loss": 0.0187, + "num_tokens": 1238826493.0, + "reward": 1.4168527126312256, + "reward_std": 0.31627556681632996, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.013392857275903225, + "rewards/format_reward/std": 0.11507843434810638, + "rewards/tag_count_reward/mean": 0.9347098469734192, + "rewards/tag_count_reward/std": 0.19009453058242798, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1096.46435546875, + "completions/mean_terminated_length": 867.1467895507812, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.4596451973789356, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13522629948295403, + "kl": 0.01885986328125, + "learning_rate": 6.900760728279272e-07, + "loss": 0.084, + "num_tokens": 1239391629.0, + "reward": 1.4224331378936768, + "reward_std": 0.3352315127849579, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.01785714365541935, + "rewards/format_reward/std": 0.13258016109466553, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.1904422789812088, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1047.4910888671875, + "completions/mean_terminated_length": 829.9891357421875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.45985829204624157, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13435680829720964, + "kl": 0.020050048828125, + "learning_rate": 6.8975789990551e-07, + "loss": 0.0838, + "num_tokens": 1239930521.0, + "reward": 1.4760044813156128, + "reward_std": 0.427381694316864, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.02901785634458065, + "rewards/format_reward/std": 0.16804419457912445, + "rewards/tag_count_reward/mean": 0.9202008843421936, + "rewards/tag_count_reward/std": 0.2235301434993744, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1022.638427734375, + "completions/mean_terminated_length": 799.7337036132812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.46007138671354747, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12259538793935482, + "kl": 0.01934814453125, + "learning_rate": 6.894396496377929e-07, + "loss": 0.0447, + "num_tokens": 1240456215.0, + "reward": 1.602678656578064, + "reward_std": 0.34144362807273865, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.0245535708963871, + "rewards/format_reward/std": 0.1549331247806549, + "rewards/tag_count_reward/mean": 0.9419642686843872, + "rewards/tag_count_reward/std": 0.18690980970859528, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1072.6898193359375, + "completions/mean_terminated_length": 850.9068603515625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4602844813808534, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12594602883892275, + "kl": 0.01708984375, + "learning_rate": 6.891213222009029e-07, + "loss": 0.1491, + "num_tokens": 1241003948.0, + "reward": 1.4665179252624512, + "reward_std": 0.4376295804977417, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.0446428582072258, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.19785068929195404, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1062.8795166015625, + "completions/mean_terminated_length": 877.352783203125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4604975760481594, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12824733230663907, + "kl": 0.019561767578125, + "learning_rate": 6.888029177710098e-07, + "loss": 0.0591, + "num_tokens": 1241543446.0, + "reward": 1.5937501192092896, + "reward_std": 0.41953450441360474, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.0357142873108387, + "rewards/format_reward/std": 0.18578433990478516, + "rewards/tag_count_reward/mean": 0.9330357313156128, + "rewards/tag_count_reward/std": 0.19496497511863708, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1061.921875, + "completions/mean_terminated_length": 860.4650268554688, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.46071067071546534, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11619496010771999, + "kl": 0.018218994140625, + "learning_rate": 6.884844365243263e-07, + "loss": 0.0535, + "num_tokens": 1242080563.0, + "reward": 1.4927456378936768, + "reward_std": 0.35678109526634216, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.17418713867664337, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.1515924483537674, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1146.2388916015625, + "completions/mean_terminated_length": 859.7970581054688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4609237653827713, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12677080195602433, + "kl": 0.0172271728515625, + "learning_rate": 6.881658786371071e-07, + "loss": 0.0889, + "num_tokens": 1242669262.0, + "reward": 1.3560268878936768, + "reward_std": 0.4218686819076538, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.02008928544819355, + "rewards/format_reward/std": 0.14046260714530945, + "rewards/tag_count_reward/mean": 0.9229910969734192, + "rewards/tag_count_reward/std": 0.2113564908504486, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1023.12060546875, + "completions/mean_terminated_length": 813.736572265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.46113686005007726, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11948263972405707, + "kl": 0.01934814453125, + "learning_rate": 6.8784724428565e-07, + "loss": 0.0415, + "num_tokens": 1243196196.0, + "reward": 1.6540179252624512, + "reward_std": 0.35513198375701904, + "rewards/accuracy_reward/mean": 0.6651785969734192, + "rewards/accuracy_reward/std": 0.47245556116104126, + "rewards/format_reward/mean": 0.0379464291036129, + "rewards/format_reward/std": 0.19128035008907318, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1700088232755661, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1121.638427734375, + "completions/mean_terminated_length": 888.754150390625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.4613499547173832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1253620262710644, + "kl": 0.018798828125, + "learning_rate": 6.875285336462942e-07, + "loss": 0.133, + "num_tokens": 1243769938.0, + "reward": 1.547991156578064, + "reward_std": 0.4231022000312805, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.0401785708963871, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.19677190482616425, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1020.779052734375, + "completions/mean_terminated_length": 849.5755615234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4615630493846892, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1256639312291695, + "kl": 0.019012451171875, + "learning_rate": 6.872097468954222e-07, + "loss": 0.0634, + "num_tokens": 1244289743.0, + "reward": 1.6590402126312256, + "reward_std": 0.39027541875839233, + "rewards/accuracy_reward/mean": 0.6607142686843872, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.0491071417927742, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.161489337682724, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1037.305908203125, + "completions/mean_terminated_length": 834.0831298828125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4617761440519951, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13823255037735785, + "kl": 0.020111083984375, + "learning_rate": 6.868908842094577e-07, + "loss": 0.147, + "num_tokens": 1244823112.0, + "reward": 1.5200893878936768, + "reward_std": 0.5107468962669373, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.0691964253783226, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9151785969734192, + "rewards/tag_count_reward/std": 0.23002667725086212, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1012.6094360351562, + "completions/mean_terminated_length": 817.6153564453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.46198923871930103, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1304483099112223, + "kl": 0.020660400390625, + "learning_rate": 6.865719457648668e-07, + "loss": 0.1326, + "num_tokens": 1245347689.0, + "reward": 1.5641741752624512, + "reward_std": 0.41145244240760803, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.0647321417927742, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9324776530265808, + "rewards/tag_count_reward/std": 0.20491690933704376, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1115.5648193359375, + "completions/mean_terminated_length": 881.1536254882812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.462202333386607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13018325682987436, + "kl": 0.01898193359375, + "learning_rate": 6.862529317381578e-07, + "loss": 0.0578, + "num_tokens": 1245916358.0, + "reward": 1.5178571939468384, + "reward_std": 0.4868509769439697, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.1116071417927742, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.1875898689031601, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1114.763427734375, + "completions/mean_terminated_length": 870.2816772460938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.46241542805391295, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2147844532380764, + "kl": 0.019866943359375, + "learning_rate": 6.859338423058802e-07, + "loss": 0.123, + "num_tokens": 1246487404.0, + "reward": 1.5580357313156128, + "reward_std": 0.4804766774177551, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.0915178582072258, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9285714030265808, + "rewards/tag_count_reward/std": 0.21913130581378937, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1005.5670166015625, + "completions/mean_terminated_length": 822.251953125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.4626285227212189, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12264472902243723, + "kl": 0.021148681640625, + "learning_rate": 6.856146776446258e-07, + "loss": 0.0947, + "num_tokens": 1247006666.0, + "reward": 1.6121652126312256, + "reward_std": 0.418075293302536, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.0803571417927742, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.16744044423103333, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 963.5313110351562, + "completions/mean_terminated_length": 830.3508911132812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.46284161738852486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13568384811676387, + "kl": 0.02032470703125, + "learning_rate": 6.852954379310276e-07, + "loss": 0.1243, + "num_tokens": 1247505096.0, + "reward": 1.7226563692092896, + "reward_std": 0.5068737268447876, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.1428571492433548, + "rewards/format_reward/std": 0.3503182828426361, + "rewards/tag_count_reward/mean": 0.9391741156578064, + "rewards/tag_count_reward/std": 0.18260477483272552, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1175.368408203125, + "completions/mean_terminated_length": 887.943603515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4630547120558308, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12012914069208075, + "kl": 0.017059326171875, + "learning_rate": 6.849761233417606e-07, + "loss": 0.0527, + "num_tokens": 1248103933.0, + "reward": 1.5145089626312256, + "reward_std": 0.5383719205856323, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211319923401, + "rewards/format_reward/mean": 0.2120535671710968, + "rewards/format_reward/std": 0.40921956300735474, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.190032958984375, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1102.87060546875, + "completions/mean_terminated_length": 848.5155639648438, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4632678067231368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1309608864990655, + "kl": 0.0185546875, + "learning_rate": 6.846567340535411e-07, + "loss": 0.0883, + "num_tokens": 1248674563.0, + "reward": 1.6819196939468384, + "reward_std": 0.5417090654373169, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.2544642984867096, + "rewards/format_reward/std": 0.4360465705394745, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.21302737295627594, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1068.890625, + "completions/mean_terminated_length": 761.6627807617188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4634809013904427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18373067984644076, + "kl": 0.024200439453125, + "learning_rate": 6.843372702431262e-07, + "loss": 0.1114, + "num_tokens": 1249228274.0, + "reward": 1.6657366752624512, + "reward_std": 0.563531756401062, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.2678571343421936, + "rewards/format_reward/std": 0.4433377981185913, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.1896803379058838, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 905.7857666015625, + "completions/mean_terminated_length": 715.4166870117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.46369399605774864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18641426264147656, + "kl": 0.02899169921875, + "learning_rate": 6.840177320873148e-07, + "loss": 0.0986, + "num_tokens": 1249707570.0, + "reward": 1.7946429252624512, + "reward_std": 0.5405769348144531, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.3683035671710968, + "rewards/format_reward/std": 0.4828835427761078, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.14598877727985382, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 941.0558471679688, + "completions/mean_terminated_length": 746.3963012695312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4639070907250546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1544676366259497, + "kl": 0.025360107421875, + "learning_rate": 6.836981197629469e-07, + "loss": 0.1289, + "num_tokens": 1250197243.0, + "reward": 1.9296876192092896, + "reward_std": 0.6100171208381653, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.4944108724594116, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.19679729640483856, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 918.32373046875, + "completions/mean_terminated_length": 740.260986328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.46412018539236055, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.134863106794164, + "kl": 0.462158203125, + "learning_rate": 6.833784334469034e-07, + "loss": 0.1227, + "num_tokens": 1250680748.0, + "reward": 2.009486675262451, + "reward_std": 0.595630943775177, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.5111607313156128, + "rewards/format_reward/std": 0.5004342198371887, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.15670275688171387, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 772.2254638671875, + "completions/mean_terminated_length": 633.2796630859375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4643332800596665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1545240759018115, + "kl": 0.03045654296875, + "learning_rate": 6.830586733161063e-07, + "loss": 0.0884, + "num_tokens": 1251089537.0, + "reward": 2.0792412757873535, + "reward_std": 0.6041925549507141, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.5691964030265808, + "rewards/format_reward/std": 0.4957422614097595, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.1879986971616745, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 883.5982666015625, + "completions/mean_terminated_length": 693.0597534179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.46454637472697247, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16020563431350518, + "kl": 0.030059814453125, + "learning_rate": 6.827388395475183e-07, + "loss": 0.1212, + "num_tokens": 1251550541.0, + "reward": 2.2036831378936768, + "reward_std": 0.6157270669937134, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.6361607313156128, + "rewards/format_reward/std": 0.4816409945487976, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.17139743268489838, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 952.33935546875, + "completions/mean_terminated_length": 732.0321655273438, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4647594693942784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14604092317056258, + "kl": 0.026763916015625, + "learning_rate": 6.824189323181429e-07, + "loss": 0.1101, + "num_tokens": 1252046517.0, + "reward": 2.123326063156128, + "reward_std": 0.5768536329269409, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.7477678656578064, + "rewards/format_reward/std": 0.4347793161869049, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16430194675922394, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 968.18310546875, + "completions/mean_terminated_length": 737.0027465820312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4649725640615844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13988484413635563, + "kl": 0.02703857421875, + "learning_rate": 6.820989518050244e-07, + "loss": 0.1146, + "num_tokens": 1252551079.0, + "reward": 2.2098214626312256, + "reward_std": 0.5962929725646973, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.7522321343421936, + "rewards/format_reward/std": 0.4321989119052887, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.18524597585201263, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 914.8594360351562, + "completions/mean_terminated_length": 687.01611328125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4651856587288903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1397153083992657, + "kl": 0.028564453125, + "learning_rate": 6.817788981852471e-07, + "loss": 0.11, + "num_tokens": 1253030984.0, + "reward": 2.1863839626312256, + "reward_std": 0.5598536729812622, + "rewards/accuracy_reward/mean": 0.46990740299224854, + "rewards/accuracy_reward/std": 0.4996722638607025, + "rewards/format_reward/mean": 0.7767857313156128, + "rewards/format_reward/std": 0.41686636209487915, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16740036010742188, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1006.060302734375, + "completions/mean_terminated_length": 782.9891967773438, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.46539875339619624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14043072337143242, + "kl": 0.026641845703125, + "learning_rate": 6.814587716359366e-07, + "loss": 0.0795, + "num_tokens": 1253552691.0, + "reward": 2.2310268878936768, + "reward_std": 0.5390534400939941, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.8102678656578064, + "rewards/format_reward/std": 0.39252743124961853, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.17929750680923462, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 960.5245971679688, + "completions/mean_terminated_length": 741.86328125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.4656118480635022, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12791565349488476, + "kl": 0.0283203125, + "learning_rate": 6.811385723342583e-07, + "loss": 0.0547, + "num_tokens": 1254051726.0, + "reward": 2.213169813156128, + "reward_std": 0.5340933203697205, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.8080357313156128, + "rewards/format_reward/std": 0.3942854404449463, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16144777834415436, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 928.5870971679688, + "completions/mean_terminated_length": 688.9295654296875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.46582494273080816, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14244584010572325, + "kl": 0.027801513671875, + "learning_rate": 6.808183004574181e-07, + "loss": 0.05, + "num_tokens": 1254536389.0, + "reward": 2.2940850257873535, + "reward_std": 0.4687335789203644, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.8727678656578064, + "rewards/format_reward/std": 0.3336053788661957, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.17231273651123047, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 814.3995971679688, + "completions/mean_terminated_length": 700.0658569335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4660380373981141, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13774832495507758, + "kl": 0.03240966796875, + "learning_rate": 6.804979561826618e-07, + "loss": 0.0473, + "num_tokens": 1254973576.0, + "reward": 2.4056921005249023, + "reward_std": 0.5147993564605713, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3480229377746582, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.1706821471452713, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 722.8951416015625, + "completions/mean_terminated_length": 631.181396484375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.46625113206542007, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14612963584527716, + "kl": 0.0355224609375, + "learning_rate": 6.801775396872757e-07, + "loss": 0.0464, + "num_tokens": 1255367545.0, + "reward": 2.5401787757873535, + "reward_std": 0.49817097187042236, + "rewards/accuracy_reward/mean": 0.6852678656578064, + "rewards/accuracy_reward/std": 0.4649282991886139, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13940991461277008, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 885.0223388671875, + "completions/mean_terminated_length": 684.0889892578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.46646422673272603, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15368045477938116, + "kl": 0.02923583984375, + "learning_rate": 6.798570511485854e-07, + "loss": 0.1468, + "num_tokens": 1255838931.0, + "reward": 2.369419813156128, + "reward_std": 0.48315125703811646, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.18948033452033997, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 949.18310546875, + "completions/mean_terminated_length": 738.771240234375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.466677321400032, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12496597918471945, + "kl": 0.02685546875, + "learning_rate": 6.795364907439569e-07, + "loss": 0.0382, + "num_tokens": 1256329493.0, + "reward": 2.2154018878936768, + "reward_std": 0.4397036135196686, + "rewards/accuracy_reward/mean": 0.3616071343421936, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.2016845941543579, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 920.8928833007812, + "completions/mean_terminated_length": 708.6259765625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4668904160673379, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15477025607048256, + "kl": 0.0318603515625, + "learning_rate": 6.792158586507961e-07, + "loss": 0.0531, + "num_tokens": 1256807221.0, + "reward": 2.342076063156128, + "reward_std": 0.48274508118629456, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9358258843421936, + "rewards/tag_count_reward/std": 0.20187872648239136, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1001.4777221679688, + "completions/mean_terminated_length": 794.4118041992188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.46710351073464385, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1289801018572182, + "kl": 0.029205322265625, + "learning_rate": 6.788951550465483e-07, + "loss": 0.074, + "num_tokens": 1257321067.0, + "reward": 2.185267925262451, + "reward_std": 0.4656493663787842, + "rewards/accuracy_reward/mean": 0.3303571343421936, + "rewards/accuracy_reward/std": 0.4708675146102905, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.16372442245483398, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 895.0223388671875, + "completions/mean_terminated_length": 723.5538940429688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4673166054019498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14544900586030074, + "kl": 0.03033447265625, + "learning_rate": 6.78574380108698e-07, + "loss": 0.0502, + "num_tokens": 1257791029.0, + "reward": 2.3526787757873535, + "reward_std": 0.4276007413864136, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.1570618450641632, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 776.5714721679688, + "completions/mean_terminated_length": 634.6004638671875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.46752970006925576, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.4323689074603266, + "kl": 0.033233642578125, + "learning_rate": 6.782535340147702e-07, + "loss": 0.0367, + "num_tokens": 1258211429.0, + "reward": 2.37890625, + "reward_std": 0.3621874153614044, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.18153512477874756, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 929.7567138671875, + "completions/mean_terminated_length": 739.9765014648438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.4677427947365617, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12852082735331846, + "kl": 0.028472900390625, + "learning_rate": 6.779326169423279e-07, + "loss": 0.0633, + "num_tokens": 1258704456.0, + "reward": 2.4190850257873535, + "reward_std": 0.41469284892082214, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.191280335187912, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15756873786449432, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1060.6763916015625, + "completions/mean_terminated_length": 871.6143188476562, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4679558894038677, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12168705589983117, + "kl": 0.025054931640625, + "learning_rate": 6.776116290689748e-07, + "loss": 0.0785, + "num_tokens": 1259251447.0, + "reward": 2.322544813156128, + "reward_std": 0.4087196886539459, + "rewards/accuracy_reward/mean": 0.4236111044883728, + "rewards/accuracy_reward/std": 0.4947032034397125, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1474141627550125, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 920.044677734375, + "completions/mean_terminated_length": 768.69873046875, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.46816898407117363, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.16256680582418945, + "kl": 0.02911376953125, + "learning_rate": 6.772905705723527e-07, + "loss": 0.0758, + "num_tokens": 1259730587.0, + "reward": 2.454799175262451, + "reward_std": 0.46542397141456604, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.18684886395931244, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 897.6629638671875, + "completions/mean_terminated_length": 709.4259643554688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4683820787384796, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1358733775767159, + "kl": 0.02886962890625, + "learning_rate": 6.769694416301431e-07, + "loss": 0.089, + "num_tokens": 1260197908.0, + "reward": 2.5279018878936768, + "reward_std": 0.4194980561733246, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1573437601327896, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1025.546875, + "completions/mean_terminated_length": 754.0480346679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.46859517340578555, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1267123037546443, + "kl": 0.02593994140625, + "learning_rate": 6.766482424200663e-07, + "loss": 0.106, + "num_tokens": 1260726265.0, + "reward": 2.396205425262451, + "reward_std": 0.44485175609588623, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.18929573893547058, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 885.732177734375, + "completions/mean_terminated_length": 712.882080078125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.46880826807309145, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.21485344973014767, + "kl": 0.033203125, + "learning_rate": 6.763269731198813e-07, + "loss": 0.0604, + "num_tokens": 1261195905.0, + "reward": 2.5284600257873535, + "reward_std": 0.3913387060165405, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.9709821343421936, + "rewards/format_reward/std": 0.16804419457912445, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1340305656194687, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 874.2098388671875, + "completions/mean_terminated_length": 730.0601196289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4690213627403974, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1435109214271408, + "kl": 0.029388427734375, + "learning_rate": 6.760056339073863e-07, + "loss": 0.0693, + "num_tokens": 1261659055.0, + "reward": 2.4559152126312256, + "reward_std": 0.42137643694877625, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12224180996417999, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 891.9531860351562, + "completions/mean_terminated_length": 740.1489868164062, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.46923445740770336, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13142701975802526, + "kl": 0.02764892578125, + "learning_rate": 6.756842249604176e-07, + "loss": 0.0653, + "num_tokens": 1262126938.0, + "reward": 2.5066964626312256, + "reward_std": 0.4629294276237488, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1457662582397461, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1027.982177734375, + "completions/mean_terminated_length": 835.88330078125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.4694475520750093, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12128582979291838, + "kl": 0.025482177734375, + "learning_rate": 6.75362746456851e-07, + "loss": 0.0746, + "num_tokens": 1262659554.0, + "reward": 2.4213171005249023, + "reward_std": 0.3704850971698761, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9665178656578064, + "rewards/format_reward/std": 0.1800929754972458, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14945264160633087, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 1002.310302734375, + "completions/mean_terminated_length": 771.5177001953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4696606467423153, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13393528617871056, + "kl": 0.02520751953125, + "learning_rate": 6.750411985745999e-07, + "loss": 0.0955, + "num_tokens": 1263174221.0, + "reward": 2.4330358505249023, + "reward_std": 0.4601520299911499, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.17527315020561218, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 900.7433471679688, + "completions/mean_terminated_length": 753.3626708984375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.46987374140962124, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1366622981075897, + "kl": 0.02740478515625, + "learning_rate": 6.747195814916165e-07, + "loss": 0.0941, + "num_tokens": 1263652698.0, + "reward": 2.4614956378936768, + "reward_std": 0.34770047664642334, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9709821343421936, + "rewards/format_reward/std": 0.16804419457912445, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.1634640097618103, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 892.9464721679688, + "completions/mean_terminated_length": 760.776123046875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.4700868360769272, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14632863444342128, + "kl": 0.02874755859375, + "learning_rate": 6.74397895385891e-07, + "loss": 0.0854, + "num_tokens": 1264119874.0, + "reward": 2.5027902126312256, + "reward_std": 0.4269680380821228, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14827017486095428, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 769.7277221679688, + "completions/mean_terminated_length": 651.253662109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47029993074423315, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.156723752788265, + "kl": 0.0318603515625, + "learning_rate": 6.740761404354523e-07, + "loss": 0.066, + "num_tokens": 1264530840.0, + "reward": 2.4737725257873535, + "reward_std": 0.3658413290977478, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775348186493, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.14149756729602814, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 845.279052734375, + "completions/mean_terminated_length": 687.345947265625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.47051302541153905, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14860018042050208, + "kl": 0.03118896484375, + "learning_rate": 6.737543168183671e-07, + "loss": 0.0812, + "num_tokens": 1264978661.0, + "reward": 2.5418527126312256, + "reward_std": 0.38505449891090393, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1648557037115097, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 824.2857666015625, + "completions/mean_terminated_length": 670.552734375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.470726120078845, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13552912912935366, + "kl": 0.0296630859375, + "learning_rate": 6.734324247127402e-07, + "loss": 0.048, + "num_tokens": 1265417189.0, + "reward": 2.591517925262451, + "reward_std": 0.34443041682243347, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.9776785969734192, + "rewards/format_reward/std": 0.1478918492794037, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.1299937218427658, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 888.2410888671875, + "completions/mean_terminated_length": 745.8145141601562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47093921474615097, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14408904699239972, + "kl": 0.03131103515625, + "learning_rate": 6.731104642967143e-07, + "loss": 0.1076, + "num_tokens": 1265882385.0, + "reward": 2.4458706378936768, + "reward_std": 0.4000353217124939, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14321638643741608, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1014.0781860351562, + "completions/mean_terminated_length": 771.9752197265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4711523094134569, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13866595416311292, + "kl": 0.02679443359375, + "learning_rate": 6.727884357484695e-07, + "loss": 0.0759, + "num_tokens": 1266407444.0, + "reward": 2.381138563156128, + "reward_std": 0.36371123790740967, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14126798510551453, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 897.7277221679688, + "completions/mean_terminated_length": 749.9596557617188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4713654040807629, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12317114249750778, + "kl": 0.0274658203125, + "learning_rate": 6.724663392462241e-07, + "loss": 0.0458, + "num_tokens": 1266874906.0, + "reward": 2.443080425262451, + "reward_std": 0.327858567237854, + "rewards/accuracy_reward/mean": 0.5416666865348816, + "rewards/accuracy_reward/std": 0.49883854389190674, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1613859087228775, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 820.2120971679688, + "completions/mean_terminated_length": 651.9365234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47157849874806884, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.17319751460156835, + "kl": 0.03863525390625, + "learning_rate": 6.721441749682341e-07, + "loss": 0.0401, + "num_tokens": 1267311049.0, + "reward": 2.5340402126312256, + "reward_std": 0.36192551255226135, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14963631331920624, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 992.49560546875, + "completions/mean_terminated_length": 763.0380859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4717915934153748, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13811273338012478, + "kl": 0.02783203125, + "learning_rate": 6.718219430927924e-07, + "loss": 0.0779, + "num_tokens": 1267832919.0, + "reward": 2.4838171005249023, + "reward_std": 0.37745755910873413, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14323382079601288, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1001.3594360351562, + "completions/mean_terminated_length": 759.8269653320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47200468808268076, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12090751907188951, + "kl": 0.02569580078125, + "learning_rate": 6.714996437982301e-07, + "loss": 0.0855, + "num_tokens": 1268353544.0, + "reward": 2.388392925262451, + "reward_std": 0.44244250655174255, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14336557686328888, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 958.2388916015625, + "completions/mean_terminated_length": 756.4312133789062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47221778274998666, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1353881829154153, + "kl": 0.028533935546875, + "learning_rate": 6.711772772629149e-07, + "loss": 0.0763, + "num_tokens": 1268847075.0, + "reward": 2.4363839626312256, + "reward_std": 0.3675597310066223, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11389532685279846, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 775.1406860351562, + "completions/mean_terminated_length": 683.7870483398438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4724308774172926, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.17928233619134123, + "kl": 0.038177490234375, + "learning_rate": 6.708548436652522e-07, + "loss": 0.0627, + "num_tokens": 1269263170.0, + "reward": 2.48828125, + "reward_std": 0.38996705412864685, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1258348822593689, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1893.0, + "completions/mean_length": 843.4219360351562, + "completions/mean_terminated_length": 681.794921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4726439720845986, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12890393591896093, + "kl": 0.032928466796875, + "learning_rate": 6.70532343183684e-07, + "loss": 0.0483, + "num_tokens": 1269707967.0, + "reward": 2.484375, + "reward_std": 0.3472144305706024, + "rewards/accuracy_reward/mean": 0.5810185074806213, + "rewards/accuracy_reward/std": 0.49396437406539917, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13422717154026031, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 881.9219360351562, + "completions/mean_terminated_length": 725.4607543945312, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.47285706675190453, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14444850950268015, + "kl": 0.031494140625, + "learning_rate": 6.702097759966897e-07, + "loss": 0.097, + "num_tokens": 1270170892.0, + "reward": 2.46484375, + "reward_std": 0.3814384341239929, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9849330186843872, + "rewards/tag_count_reward/std": 0.0911129042506218, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 778.966552734375, + "completions/mean_terminated_length": 674.746337890625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4730701614192105, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.16281714746927378, + "kl": 0.0347900390625, + "learning_rate": 6.698871422827857e-07, + "loss": 0.0518, + "num_tokens": 1270580365.0, + "reward": 2.4285714626312256, + "reward_std": 0.3740359842777252, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14433756470680237, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1029.1273193359375, + "completions/mean_terminated_length": 790.5482177734375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.47328325608651645, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13704965481849363, + "kl": 0.027740478515625, + "learning_rate": 6.695644422205252e-07, + "loss": 0.0744, + "num_tokens": 1271108262.0, + "reward": 2.3275671005249023, + "reward_std": 0.48285603523254395, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.19066503643989563, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 903.9085083007812, + "completions/mean_terminated_length": 760.1784057617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4734963507538224, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1342017231116232, + "kl": 0.03118896484375, + "learning_rate": 6.692416759884978e-07, + "loss": 0.088, + "num_tokens": 1271585245.0, + "reward": 2.3984375, + "reward_std": 0.41601473093032837, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12805373966693878, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 845.997802734375, + "completions/mean_terminated_length": 698.3834838867188, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.47370944542112836, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14661070479050456, + "kl": 0.031219482421875, + "learning_rate": 6.689188437653298e-07, + "loss": 0.0903, + "num_tokens": 1272027564.0, + "reward": 2.4693081378936768, + "reward_std": 0.42670324444770813, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15604011714458466, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 947.5402221679688, + "completions/mean_terminated_length": 827.6881103515625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.47392254008843426, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12620281372711523, + "kl": 0.02490234375, + "learning_rate": 6.685959457296842e-07, + "loss": 0.0778, + "num_tokens": 1272524782.0, + "reward": 2.4263393878936768, + "reward_std": 0.4004477560520172, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.1249600425362587, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 895.622802734375, + "completions/mean_terminated_length": 737.6827392578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4741356347557402, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1317392542986843, + "kl": 0.029998779296875, + "learning_rate": 6.682729820602605e-07, + "loss": 0.0906, + "num_tokens": 1272991237.0, + "reward": 2.4676339626312256, + "reward_std": 0.4688414931297302, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1286761611700058, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 975.05810546875, + "completions/mean_terminated_length": 802.72021484375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4743487294230462, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1256100173129875, + "kl": 0.025970458984375, + "learning_rate": 6.679499529357943e-07, + "loss": 0.1131, + "num_tokens": 1273498287.0, + "reward": 2.4386162757873535, + "reward_std": 0.4945499897003174, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14839327335357666, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 880.9553833007812, + "completions/mean_terminated_length": 707.3948974609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47456182409035214, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.40490642978055996, + "kl": 0.046295166015625, + "learning_rate": 6.676268585350571e-07, + "loss": 0.0894, + "num_tokens": 1273958555.0, + "reward": 2.359375, + "reward_std": 0.46043434739112854, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.17017030715942383, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1101.8660888671875, + "completions/mean_terminated_length": 873.8504028320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4747749187576581, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12181976137174877, + "kl": 0.023956298828125, + "learning_rate": 6.673036990368579e-07, + "loss": 0.0739, + "num_tokens": 1274526671.0, + "reward": 2.3002233505249023, + "reward_std": 0.4114280641078949, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14360485970973969, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 965.2745971679688, + "completions/mean_terminated_length": 774.8740234375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.47498801342496405, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14857873051432402, + "kl": 0.026885986328125, + "learning_rate": 6.669804746200396e-07, + "loss": 0.1138, + "num_tokens": 1275026266.0, + "reward": 2.2862725257873535, + "reward_std": 0.515708863735199, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1514935940504074, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 899.4866333007812, + "completions/mean_terminated_length": 721.8814086914062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47520110809227, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.19132930027868855, + "kl": 0.034027099609375, + "learning_rate": 6.666571854634828e-07, + "loss": 0.1158, + "num_tokens": 1275502292.0, + "reward": 2.4302456378936768, + "reward_std": 0.4945792257785797, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12201692909002304, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 980.529052734375, + "completions/mean_terminated_length": 779.4933471679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47541420275957597, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1371905268921433, + "kl": 0.027740478515625, + "learning_rate": 6.663338317461031e-07, + "loss": 0.0971, + "num_tokens": 1276005537.0, + "reward": 2.3582589626312256, + "reward_std": 0.5346171855926514, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.8794642686843872, + "rewards/format_reward/std": 0.3259509205818176, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16375111043453217, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1028.1875, + "completions/mean_terminated_length": 813.2000122070312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.47562729742688187, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12728763954654915, + "kl": 0.023345947265625, + "learning_rate": 6.660104136468524e-07, + "loss": 0.1045, + "num_tokens": 1276549685.0, + "reward": 2.232142925262451, + "reward_std": 0.4899819493293762, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.8638392686843872, + "rewards/format_reward/std": 0.34334254264831543, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15989768505096436, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 857.9710083007812, + "completions/mean_terminated_length": 708.4698486328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4758403920941878, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15562283024416393, + "kl": 0.03094482421875, + "learning_rate": 6.656869313447175e-07, + "loss": 0.1107, + "num_tokens": 1277002664.0, + "reward": 2.427455425262451, + "reward_std": 0.5242178440093994, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.8683035969734192, + "rewards/format_reward/std": 0.3385384678840637, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15374813973903656, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 927.2879638671875, + "completions/mean_terminated_length": 719.7486572265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4760534867614938, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1467290754377202, + "kl": 0.03033447265625, + "learning_rate": 6.653633850187211e-07, + "loss": 0.1399, + "num_tokens": 1277487673.0, + "reward": 2.33203125, + "reward_std": 0.5383273959159851, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.8504464030265808, + "rewards/format_reward/std": 0.3570319712162018, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.18044531345367432, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 857.8035888671875, + "completions/mean_terminated_length": 698.1063232421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.47626658142879974, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12760507720636563, + "kl": 0.029449462890625, + "learning_rate": 6.650397748479214e-07, + "loss": 0.0663, + "num_tokens": 1277943169.0, + "reward": 2.3621652126312256, + "reward_std": 0.4935818314552307, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.8638392686843872, + "rewards/format_reward/std": 0.34334254264831543, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1199323832988739, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 851.4754638671875, + "completions/mean_terminated_length": 721.160888671875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4764796760961057, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15966362366683684, + "kl": 0.031524658203125, + "learning_rate": 6.64716101011412e-07, + "loss": 0.1179, + "num_tokens": 1278393318.0, + "reward": 2.3621652126312256, + "reward_std": 0.5111830234527588, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16767139732837677, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 896.982177734375, + "completions/mean_terminated_length": 755.6290893554688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.47669277076341166, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12489564653119974, + "kl": 0.02880859375, + "learning_rate": 6.643923636883213e-07, + "loss": 0.0223, + "num_tokens": 1278859998.0, + "reward": 2.45703125, + "reward_std": 0.45318177342414856, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1293378323316574, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 931.982177734375, + "completions/mean_terminated_length": 739.1622924804688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4769058654307176, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13247195221079344, + "kl": 0.02728271484375, + "learning_rate": 6.640685630578132e-07, + "loss": 0.0914, + "num_tokens": 1279346102.0, + "reward": 2.3325893878936768, + "reward_std": 0.5138127207756042, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3480229377746582, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15090012550354004, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 946.2277221679688, + "completions/mean_terminated_length": 749.0684814453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47711896009802357, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13901713735170673, + "kl": 0.02899169921875, + "learning_rate": 6.63744699299087e-07, + "loss": 0.1297, + "num_tokens": 1279836572.0, + "reward": 2.279017925262451, + "reward_std": 0.5506203174591064, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.8571428656578064, + "rewards/format_reward/std": 0.3503182828426361, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.17999590933322906, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 994.8817138671875, + "completions/mean_terminated_length": 772.8729858398438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47733205476532947, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14794792537225845, + "kl": 0.025238037109375, + "learning_rate": 6.634207725913759e-07, + "loss": 0.0665, + "num_tokens": 1280350503.0, + "reward": 2.2879464626312256, + "reward_std": 0.4296041429042816, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.315234512090683, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14767222106456757, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 971.8058471679688, + "completions/mean_terminated_length": 741.401123046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47754514943263543, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7126879225135037, + "kl": 0.12786865234375, + "learning_rate": 6.630967831139489e-07, + "loss": 0.1403, + "num_tokens": 1280864944.0, + "reward": 2.2310268878936768, + "reward_std": 0.5610305666923523, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.8147321343421936, + "rewards/format_reward/std": 0.38894903659820557, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.18391697108745575, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 799.4799194335938, + "completions/mean_terminated_length": 673.7075805664062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4777582440999414, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14736298039460333, + "kl": 0.033477783203125, + "learning_rate": 6.627727310461091e-07, + "loss": 0.0927, + "num_tokens": 1281282519.0, + "reward": 2.4921875, + "reward_std": 0.478099524974823, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1253739446401596, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 950.513427734375, + "completions/mean_terminated_length": 704.62841796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.47797133876724734, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1254623105078358, + "kl": 0.026214599609375, + "learning_rate": 6.624486165671948e-07, + "loss": 0.0536, + "num_tokens": 1281779117.0, + "reward": 2.37890625, + "reward_std": 0.39184245467185974, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11780036240816116, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 967.5491333007812, + "completions/mean_terminated_length": 797.2454833984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4781844334345533, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1311891824439268, + "kl": 0.028594970703125, + "learning_rate": 6.621244398565784e-07, + "loss": 0.0573, + "num_tokens": 1282287523.0, + "reward": 2.4481027126312256, + "reward_std": 0.45720866322517395, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11938986927270889, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 875.310302734375, + "completions/mean_terminated_length": 683.4155883789062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47839752810185926, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1210130054182694, + "kl": 0.0299072265625, + "learning_rate": 6.61800201093667e-07, + "loss": 0.0265, + "num_tokens": 1282750878.0, + "reward": 2.4185268878936768, + "reward_std": 0.4307468831539154, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11141301691532135, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 855.5714721679688, + "completions/mean_terminated_length": 719.1243896484375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4786106227691652, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14916156842532785, + "kl": 0.030120849609375, + "learning_rate": 6.614759004579019e-07, + "loss": 0.0991, + "num_tokens": 1283205358.0, + "reward": 2.4581475257873535, + "reward_std": 0.4494197964668274, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12109260261058807, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 819.6049194335938, + "completions/mean_terminated_length": 715.503662109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4788237174364712, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13958415695662107, + "kl": 0.031982421875, + "learning_rate": 6.611515381287584e-07, + "loss": 0.0495, + "num_tokens": 1283640861.0, + "reward": 2.474330425262451, + "reward_std": 0.42102277278900146, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12848198413848877, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 969.71435546875, + "completions/mean_terminated_length": 773.4037475585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4790368121037771, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.6560849333039577, + "kl": 0.031158447265625, + "learning_rate": 6.608271142857467e-07, + "loss": 0.0627, + "num_tokens": 1284153917.0, + "reward": 2.263951063156128, + "reward_std": 0.43909579515457153, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387789011001587, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.16219128668308258, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 934.8995971679688, + "completions/mean_terminated_length": 745.9921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.47924990677108303, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1441793425567929, + "kl": 0.028472900390625, + "learning_rate": 6.605026291084103e-07, + "loss": 0.0917, + "num_tokens": 1284651456.0, + "reward": 2.4464287757873535, + "reward_std": 0.43612557649612427, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15090012550354004, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1101.540283203125, + "completions/mean_terminated_length": 860.2857055664062, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.479463001438389, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1268967009958678, + "kl": 0.024505615234375, + "learning_rate": 6.601780827763268e-07, + "loss": 0.0677, + "num_tokens": 1285216850.0, + "reward": 2.3370537757873535, + "reward_std": 0.5114197731018066, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.1634649783372879, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 959.3683471679688, + "completions/mean_terminated_length": 800.6675415039062, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.47967609610569495, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1359626959900769, + "kl": 0.025665283203125, + "learning_rate": 6.598534754691082e-07, + "loss": 0.0691, + "num_tokens": 1285728231.0, + "reward": 2.3270089626312256, + "reward_std": 0.45948970317840576, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15788237750530243, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 892.4866333007812, + "completions/mean_terminated_length": 685.7105712890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4798891907730009, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14413447586242895, + "kl": 0.031036376953125, + "learning_rate": 6.595288073663992e-07, + "loss": 0.0982, + "num_tokens": 1286192369.0, + "reward": 2.4620537757873535, + "reward_std": 0.442236989736557, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12383606284856796, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 896.0848388671875, + "completions/mean_terminated_length": 693.5170288085938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.48010228544030686, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13026983204389048, + "kl": 0.029022216796875, + "learning_rate": 6.592040786478794e-07, + "loss": 0.051, + "num_tokens": 1286660279.0, + "reward": 2.3560268878936768, + "reward_std": 0.4239169657230377, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3310886323451996, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.151918426156044, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1056.15625, + "completions/mean_terminated_length": 840.5380859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4803153801076128, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12667379774917495, + "kl": 0.024383544921875, + "learning_rate": 6.588792894932605e-07, + "loss": 0.0708, + "num_tokens": 1287205261.0, + "reward": 2.3526787757873535, + "reward_std": 0.48564672470092773, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.17041954398155212, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 883.3973388671875, + "completions/mean_terminated_length": 706.7609252929688, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.4805284747749188, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1438943400326947, + "kl": 0.03204345703125, + "learning_rate": 6.585544400822891e-07, + "loss": 0.0701, + "num_tokens": 1287671823.0, + "reward": 2.3309152126312256, + "reward_std": 0.37643617391586304, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.151995450258255, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1002.7567138671875, + "completions/mean_terminated_length": 785.8193969726562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4807415694422247, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13384297494057554, + "kl": 0.026580810546875, + "learning_rate": 6.582295305947442e-07, + "loss": 0.1164, + "num_tokens": 1288191362.0, + "reward": 2.435826063156128, + "reward_std": 0.4654217064380646, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1590748131275177, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 856.7857666015625, + "completions/mean_terminated_length": 723.771728515625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.48095466410953064, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15303098879110114, + "kl": 0.033416748046875, + "learning_rate": 6.579045612104384e-07, + "loss": 0.1397, + "num_tokens": 1288646898.0, + "reward": 2.415736675262451, + "reward_std": 0.48180699348449707, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.1983288675546646, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 986.3058471679688, + "completions/mean_terminated_length": 755.5027465820312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4811677587768366, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13559269148845762, + "kl": 0.0257568359375, + "learning_rate": 6.575795321092173e-07, + "loss": 0.1157, + "num_tokens": 1289157803.0, + "reward": 2.5189733505249023, + "reward_std": 0.41055232286453247, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13399910926818848, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 882.5469360351562, + "completions/mean_terminated_length": 722.814697265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48138085344414255, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14875046399111191, + "kl": 0.030609130859375, + "learning_rate": 6.572544434709597e-07, + "loss": 0.064, + "num_tokens": 1289619376.0, + "reward": 2.5011162757873535, + "reward_std": 0.40965041518211365, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.49251341819763184, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1236090287566185, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 923.3660888671875, + "completions/mean_terminated_length": 715.1005249023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4815939481114485, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1331591315388688, + "kl": 0.029998779296875, + "learning_rate": 6.569292954755773e-07, + "loss": 0.0709, + "num_tokens": 1290097284.0, + "reward": 2.416294813156128, + "reward_std": 0.431540310382843, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1631094515323639, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 960.6295166015625, + "completions/mean_terminated_length": 782.6961059570312, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.48180704277875447, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14204432141140563, + "kl": 0.027252197265625, + "learning_rate": 6.566040883030146e-07, + "loss": 0.1012, + "num_tokens": 1290598254.0, + "reward": 2.388392925262451, + "reward_std": 0.39278843998908997, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14767220616340637, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 968.8125610351562, + "completions/mean_terminated_length": 723.4082641601562, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4820201374460604, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14223632467210331, + "kl": 0.030517578125, + "learning_rate": 6.562788221332488e-07, + "loss": 0.1363, + "num_tokens": 1291103626.0, + "reward": 2.4659600257873535, + "reward_std": 0.4136297404766083, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.16194480657577515, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 868.3482666015625, + "completions/mean_terminated_length": 730.0848388671875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4822332321133664, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1250078107907124, + "kl": 0.02972412109375, + "learning_rate": 6.559534971462901e-07, + "loss": 0.1066, + "num_tokens": 1291559462.0, + "reward": 2.3995537757873535, + "reward_std": 0.32701024413108826, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12607400119304657, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 945.6160888671875, + "completions/mean_terminated_length": 807.1256103515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48244632678067234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13173478626303223, + "kl": 0.028564453125, + "learning_rate": 6.556281135221806e-07, + "loss": 0.0344, + "num_tokens": 1292047194.0, + "reward": 2.421875, + "reward_std": 0.456134170293808, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15220166742801666, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 889.4085083007812, + "completions/mean_terminated_length": 727.2646484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48265942144797824, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15084527769165776, + "kl": 0.031585693359375, + "learning_rate": 6.553026714409954e-07, + "loss": 0.1309, + "num_tokens": 1292514417.0, + "reward": 2.4877233505249023, + "reward_std": 0.4870128035545349, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.17070867121219635, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 872.9263916015625, + "completions/mean_terminated_length": 725.3040161132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4828725161152842, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15039152057329863, + "kl": 0.03240966796875, + "learning_rate": 6.549771710828418e-07, + "loss": 0.0873, + "num_tokens": 1292976240.0, + "reward": 2.5078125, + "reward_std": 0.4377419054508209, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13607001304626465, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 934.4397583007812, + "completions/mean_terminated_length": 788.2146606445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48308561078259016, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1460816354359092, + "kl": 0.027740478515625, + "learning_rate": 6.546516126278594e-07, + "loss": 0.0892, + "num_tokens": 1293462037.0, + "reward": 2.3074777126312256, + "reward_std": 0.3869589567184448, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 860.8281860351562, + "completions/mean_terminated_length": 724.9825439453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4832987054498961, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1325369125378957, + "kl": 0.02899169921875, + "learning_rate": 6.543259962562196e-07, + "loss": 0.0666, + "num_tokens": 1293912584.0, + "reward": 2.435826063156128, + "reward_std": 0.3635029196739197, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407235741615295, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.126290425658226, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1024.3170166015625, + "completions/mean_terminated_length": 805.1544799804688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4835118001172021, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13027425240007753, + "kl": 0.02423095703125, + "learning_rate": 6.540003221481267e-07, + "loss": 0.1066, + "num_tokens": 1294445366.0, + "reward": 2.3582589626312256, + "reward_std": 0.4220695197582245, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14886364340782166, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 908.9263916015625, + "completions/mean_terminated_length": 712.123046875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.48372489478450803, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13369689805636678, + "kl": 0.02984619140625, + "learning_rate": 6.536745904838158e-07, + "loss": 0.0618, + "num_tokens": 1294923365.0, + "reward": 2.3973214626312256, + "reward_std": 0.4122970402240753, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14140157401561737, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 886.2991333007812, + "completions/mean_terminated_length": 706.6546020507812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.483937989451814, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.19779333977512292, + "kl": 0.033447265625, + "learning_rate": 6.533488014435547e-07, + "loss": 0.0849, + "num_tokens": 1295385067.0, + "reward": 2.493861675262451, + "reward_std": 0.41481515765190125, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.14149756729602814, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 847.2835083007812, + "completions/mean_terminated_length": 709.8880615234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48415108411911995, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.13033102980848388, + "kl": 0.03125, + "learning_rate": 6.530229552076428e-07, + "loss": 0.1271, + "num_tokens": 1295830746.0, + "reward": 2.4849331378936768, + "reward_std": 0.3559582829475403, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.13252247869968414, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1057.149658203125, + "completions/mean_terminated_length": 835.1557006835938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48436417878642585, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1312186455198644, + "kl": 0.026397705078125, + "learning_rate": 6.526970519564109e-07, + "loss": 0.0979, + "num_tokens": 1296374317.0, + "reward": 2.3404018878936768, + "reward_std": 0.48946699500083923, + "rewards/accuracy_reward/mean": 0.4722222089767456, + "rewards/accuracy_reward/std": 0.49980661273002625, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.161169171333313, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 902.6138916015625, + "completions/mean_terminated_length": 722.074951171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4845772734537318, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.19680341486418076, + "kl": 0.0323486328125, + "learning_rate": 6.523710918702215e-07, + "loss": 0.1105, + "num_tokens": 1296853168.0, + "reward": 2.3777902126312256, + "reward_std": 0.4936728775501251, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14052370190620422, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1042.13623046875, + "completions/mean_terminated_length": 760.4942626953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48479036812103776, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14608437731654025, + "kl": 0.024688720703125, + "learning_rate": 6.520450751294685e-07, + "loss": 0.1392, + "num_tokens": 1297396829.0, + "reward": 2.302455425262451, + "reward_std": 0.5085361003875732, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.17337898910045624, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 821.185302734375, + "completions/mean_terminated_length": 670.5238037109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4850034627883437, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14799577977550182, + "kl": 0.03216552734375, + "learning_rate": 6.517190019145773e-07, + "loss": 0.0952, + "num_tokens": 1297835984.0, + "reward": 2.3487725257873535, + "reward_std": 0.4308519959449768, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15514148771762848, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 800.7969360351562, + "completions/mean_terminated_length": 647.631591796875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.4852165574556497, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14685880925108288, + "kl": 0.0330810546875, + "learning_rate": 6.513928724060046e-07, + "loss": 0.0677, + "num_tokens": 1298262997.0, + "reward": 2.4871652126312256, + "reward_std": 0.41233643889427185, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.16169020533561707, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 859.8772583007812, + "completions/mean_terminated_length": 727.2084350585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48542965212295563, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13130114987824393, + "kl": 0.030181884765625, + "learning_rate": 6.510666867842378e-07, + "loss": 0.102, + "num_tokens": 1298717022.0, + "reward": 2.4910714626312256, + "reward_std": 0.4890372157096863, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.13043475151062012, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 921.4420166015625, + "completions/mean_terminated_length": 757.2122802734375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4856427467902616, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12233424875395836, + "kl": 0.028045654296875, + "learning_rate": 6.50740445229796e-07, + "loss": 0.0814, + "num_tokens": 1299202340.0, + "reward": 2.470982313156128, + "reward_std": 0.43373480439186096, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.09686583280563354, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 961.2344360351562, + "completions/mean_terminated_length": 805.9821166992188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48585584145756755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13977673098951304, + "kl": 0.0272216796875, + "learning_rate": 6.504141479232287e-07, + "loss": 0.0755, + "num_tokens": 1299704349.0, + "reward": 2.415736675262451, + "reward_std": 0.5184842348098755, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.14720545709133148, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 911.2813110351562, + "completions/mean_terminated_length": 742.2307739257812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48606893612487345, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13926447540570908, + "kl": 0.03070068359375, + "learning_rate": 6.500877950451167e-07, + "loss": 0.0714, + "num_tokens": 1300178395.0, + "reward": 2.3794643878936768, + "reward_std": 0.42163488268852234, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995505809783936, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14040927588939667, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 867.8170166015625, + "completions/mean_terminated_length": 695.7698364257812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4862820307921794, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13024994210825913, + "kl": 0.03045654296875, + "learning_rate": 6.497613867760711e-07, + "loss": 0.0084, + "num_tokens": 1300635369.0, + "reward": 2.4107143878936768, + "reward_std": 0.4235752820968628, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12672585248947144, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 955.3772583007812, + "completions/mean_terminated_length": 721.455322265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48649512545948537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15891920959067704, + "kl": 0.032745361328125, + "learning_rate": 6.494349232967341e-07, + "loss": 0.098, + "num_tokens": 1301134674.0, + "reward": 2.3387277126312256, + "reward_std": 0.5449174046516418, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390644073486, + "rewards/format_reward/mean": 0.8816964030265808, + "rewards/format_reward/std": 0.32332828640937805, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.17089413106441498, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 896.4375610351562, + "completions/mean_terminated_length": 672.2666625976562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4867082201267913, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.154094563296589, + "kl": 0.0306396484375, + "learning_rate": 6.491084047877781e-07, + "loss": 0.1142, + "num_tokens": 1301608598.0, + "reward": 2.4129464626312256, + "reward_std": 0.4288695752620697, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12383606284856796, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 890.4219360351562, + "completions/mean_terminated_length": 721.6700439453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4869213147940973, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14596596261185982, + "kl": 0.031707763671875, + "learning_rate": 6.487818314299062e-07, + "loss": 0.0928, + "num_tokens": 1302072915.0, + "reward": 2.388951063156128, + "reward_std": 0.474895715713501, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1448889821767807, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1012.6027221679688, + "completions/mean_terminated_length": 780.62841796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48713440946140324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1552924602429743, + "kl": 0.02838134765625, + "learning_rate": 6.484552034038515e-07, + "loss": 0.1305, + "num_tokens": 1302600577.0, + "reward": 2.3097100257873535, + "reward_std": 0.5640348196029663, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.17044061422348022, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 900.8504638671875, + "completions/mean_terminated_length": 723.4561767578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4873475041287092, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1473554784091451, + "kl": 0.029571533203125, + "learning_rate": 6.481285208903781e-07, + "loss": 0.0817, + "num_tokens": 1303072190.0, + "reward": 2.4213171005249023, + "reward_std": 0.40599721670150757, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13927440345287323, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 901.6094360351562, + "completions/mean_terminated_length": 751.0732421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48756059879601515, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13925707693131345, + "kl": 0.02801513671875, + "learning_rate": 6.478017840702793e-07, + "loss": 0.0772, + "num_tokens": 1303547167.0, + "reward": 2.4838171005249023, + "reward_std": 0.459294855594635, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13359208405017853, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 948.21435546875, + "completions/mean_terminated_length": 768.2493286132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48777369346332106, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12104823943059272, + "kl": 0.0294189453125, + "learning_rate": 6.474749931243791e-07, + "loss": 0.0461, + "num_tokens": 1304044303.0, + "reward": 2.4302456378936768, + "reward_std": 0.4273103177547455, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13311463594436646, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 825.0335083007812, + "completions/mean_terminated_length": 660.9392700195312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.487986788130627, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14631506269878528, + "kl": 0.032806396484375, + "learning_rate": 6.471481482335315e-07, + "loss": 0.0983, + "num_tokens": 1304481998.0, + "reward": 2.501674175262451, + "reward_std": 0.4627715051174164, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14372976124286652, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1064.107177734375, + "completions/mean_terminated_length": 833.718994140625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.48819988279793297, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13191893932490797, + "kl": 0.025848388671875, + "learning_rate": 6.468212495786196e-07, + "loss": 0.1085, + "num_tokens": 1305027390.0, + "reward": 2.3309152126312256, + "reward_std": 0.5329724550247192, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.17015472054481506, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 945.8906860351562, + "completions/mean_terminated_length": 741.7962646484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48841297746523893, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14282174751279508, + "kl": 0.02899169921875, + "learning_rate": 6.464942973405573e-07, + "loss": 0.1024, + "num_tokens": 1305519341.0, + "reward": 2.4174108505249023, + "reward_std": 0.477356493473053, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855977296829224, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.15415765345096588, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 914.9933471679688, + "completions/mean_terminated_length": 733.0077514648438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4886260721325449, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1352025359548213, + "kl": 0.02703857421875, + "learning_rate": 6.461672917002873e-07, + "loss": 0.0956, + "num_tokens": 1305994138.0, + "reward": 2.515625, + "reward_std": 0.4342189133167267, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10688953846693039, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 971.7678833007812, + "completions/mean_terminated_length": 789.1174926757812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48883916679985084, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1448599229780544, + "kl": 0.0264892578125, + "learning_rate": 6.458402328387826e-07, + "loss": 0.0982, + "num_tokens": 1306502194.0, + "reward": 2.3113839626312256, + "reward_std": 0.4006907045841217, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12975822389125824, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 971.2813110351562, + "completions/mean_terminated_length": 751.3064575195312, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4890522614671568, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1124431648248687, + "kl": 0.026092529296875, + "learning_rate": 6.455131209370447e-07, + "loss": 0.0613, + "num_tokens": 1307005232.0, + "reward": 2.5072546005249023, + "reward_std": 0.3755997121334076, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10500273108482361, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 920.185302734375, + "completions/mean_terminated_length": 755.7723999023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48926535613446276, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12673188322453885, + "kl": 0.02935791015625, + "learning_rate": 6.451859561761054e-07, + "loss": 0.0314, + "num_tokens": 1307484531.0, + "reward": 2.4888393878936768, + "reward_std": 0.37483373284339905, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13731664419174194, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 968.794677734375, + "completions/mean_terminated_length": 751.7962646484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.48947845080176866, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1263471003296918, + "kl": 0.02716064453125, + "learning_rate": 6.448587387370249e-07, + "loss": 0.0986, + "num_tokens": 1307987831.0, + "reward": 2.466517925262451, + "reward_std": 0.45208361744880676, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14530304074287415, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 922.1116333007812, + "completions/mean_terminated_length": 734.4635620117188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4896915454690746, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 31.967076941628456, + "kl": 1.966094970703125, + "learning_rate": 6.445314688008937e-07, + "loss": 0.1528, + "num_tokens": 1308476441.0, + "reward": 2.5078125, + "reward_std": 0.4202045202255249, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12695714831352234, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 908.279052734375, + "completions/mean_terminated_length": 752.0736083984375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4899046401363806, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1648802877687333, + "kl": 0.033477783203125, + "learning_rate": 6.442041465488301e-07, + "loss": 0.1023, + "num_tokens": 1308953478.0, + "reward": 2.474330425262451, + "reward_std": 0.42755141854286194, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09310024231672287, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1010.341552734375, + "completions/mean_terminated_length": 798.3468017578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49011773480368653, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10344181008345053, + "kl": 0.026031494140625, + "learning_rate": 6.438767721619824e-07, + "loss": 0.0688, + "num_tokens": 1309476623.0, + "reward": 2.5284600257873535, + "reward_std": 0.3568534255027771, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.12170960009098053, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 941.357177734375, + "completions/mean_terminated_length": 756.9166870117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4903308294709925, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13165597797358203, + "kl": 0.029388427734375, + "learning_rate": 6.435493458215266e-07, + "loss": 0.04, + "num_tokens": 1309970783.0, + "reward": 2.4720983505249023, + "reward_std": 0.3915558159351349, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407156348228455, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266101151704788, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 954.4866333007812, + "completions/mean_terminated_length": 748.54638671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49054392413829845, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1352858919809436, + "kl": 0.03277587890625, + "learning_rate": 6.432218677086686e-07, + "loss": 0.0793, + "num_tokens": 1310461657.0, + "reward": 2.396763563156128, + "reward_std": 0.41674861311912537, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13463464379310608, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 956.9308471679688, + "completions/mean_terminated_length": 784.9534912109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4907570188056044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14683353277306666, + "kl": 0.029388427734375, + "learning_rate": 6.428943380046423e-07, + "loss": 0.0518, + "num_tokens": 1310969530.0, + "reward": 2.291294813156128, + "reward_std": 0.45143747329711914, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1804911345243454, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1023.1942138671875, + "completions/mean_terminated_length": 769.1336669921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49097011347291036, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13616806672736026, + "kl": 0.025421142578125, + "learning_rate": 6.425667568907105e-07, + "loss": 0.0815, + "num_tokens": 1311499345.0, + "reward": 2.3325893878936768, + "reward_std": 0.448726087808609, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1500372737646103, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 984.7813110351562, + "completions/mean_terminated_length": 764.1132202148438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49118320814021627, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12283919084358452, + "kl": 0.0263671875, + "learning_rate": 6.42239124548164e-07, + "loss": 0.066, + "num_tokens": 1312010639.0, + "reward": 2.236049175262451, + "reward_std": 0.35402610898017883, + "rewards/accuracy_reward/mean": 0.3348214328289032, + "rewards/accuracy_reward/std": 0.47245556116104126, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12245608866214752, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 899.6451416015625, + "completions/mean_terminated_length": 683.3766479492188, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.4913963028075222, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13587041197781646, + "kl": 0.031402587890625, + "learning_rate": 6.419114411583224e-07, + "loss": 0.0826, + "num_tokens": 1312486080.0, + "reward": 2.3755581378936768, + "reward_std": 0.415954053401947, + "rewards/accuracy_reward/mean": 0.5185185074806213, + "rewards/accuracy_reward/std": 0.5002362728118896, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1217813789844513, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 878.8214721679688, + "completions/mean_terminated_length": 721.9443359375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4916093974748282, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14225775126932688, + "kl": 0.028900146484375, + "learning_rate": 6.415837069025335e-07, + "loss": 0.0645, + "num_tokens": 1312954400.0, + "reward": 2.392299175262451, + "reward_std": 0.46852749586105347, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10481233894824982, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 912.38623046875, + "completions/mean_terminated_length": 705.6385498046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49182249214213414, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15059949397848466, + "kl": 0.034088134765625, + "learning_rate": 6.412559219621728e-07, + "loss": 0.0879, + "num_tokens": 1313434541.0, + "reward": 2.3878350257873535, + "reward_std": 0.4745412766933441, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.8816964030265808, + "rewards/format_reward/std": 0.32332828640937805, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16570167243480682, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 924.5625610351562, + "completions/mean_terminated_length": 789.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4920355868094401, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12203006358251214, + "kl": 0.027801513671875, + "learning_rate": 6.409280865186444e-07, + "loss": 0.1011, + "num_tokens": 1313915561.0, + "reward": 2.4246652126312256, + "reward_std": 0.5094877481460571, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1328611820936203, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1003.7344360351562, + "completions/mean_terminated_length": 762.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.49224868147674605, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10814766218822099, + "kl": 0.024810791015625, + "learning_rate": 6.406002007533799e-07, + "loss": 0.0687, + "num_tokens": 1314436162.0, + "reward": 2.439732313156128, + "reward_std": 0.43360665440559387, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12782442569732666, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1040.3504638671875, + "completions/mean_terminated_length": 758.2085571289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.492461776144052, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1248546146688111, + "kl": 0.025543212890625, + "learning_rate": 6.402722648478394e-07, + "loss": 0.1076, + "num_tokens": 1314971919.0, + "reward": 2.2862725257873535, + "reward_std": 0.5106185674667358, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.8816964030265808, + "rewards/format_reward/std": 0.32332828640937805, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16907276213169098, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 932.4866333007812, + "completions/mean_terminated_length": 756.6563720703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49267487081135797, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1378317299999715, + "kl": 0.027618408203125, + "learning_rate": 6.3994427898351e-07, + "loss": 0.0544, + "num_tokens": 1315459993.0, + "reward": 2.46484375, + "reward_std": 0.4626796245574951, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875497817993164, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13646738231182098, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 897.294677734375, + "completions/mean_terminated_length": 715.9173583984375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.49288796547866387, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13567760955839048, + "kl": 0.028106689453125, + "learning_rate": 6.396162433419068e-07, + "loss": 0.0805, + "num_tokens": 1315933133.0, + "reward": 2.463169813156128, + "reward_std": 0.3479953706264496, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.10171286761760712, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 809.825927734375, + "completions/mean_terminated_length": 685.0958251953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4931010601459698, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14932765027112538, + "kl": 0.0360107421875, + "learning_rate": 6.392881581045722e-07, + "loss": 0.1164, + "num_tokens": 1316361119.0, + "reward": 2.6160714626312256, + "reward_std": 0.46530845761299133, + "rewards/accuracy_reward/mean": 0.7276785969734192, + "rewards/accuracy_reward/std": 0.4456520974636078, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15546400845050812, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1097.2098388671875, + "completions/mean_terminated_length": 809.7615966796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4933141548132758, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12658388234149398, + "kl": 0.0245361328125, + "learning_rate": 6.389600234530767e-07, + "loss": 0.0777, + "num_tokens": 1316928925.0, + "reward": 2.2650671005249023, + "reward_std": 0.4566171169281006, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.17044061422348022, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 871.9174194335938, + "completions/mean_terminated_length": 720.833740234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49352724948058174, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1392635750197061, + "kl": 0.030517578125, + "learning_rate": 6.386318395690178e-07, + "loss": 0.1008, + "num_tokens": 1317386856.0, + "reward": 2.4598214626312256, + "reward_std": 0.4671509265899658, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15456202626228333, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 939.8594360351562, + "completions/mean_terminated_length": 744.989501953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4937403441478877, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13604923951682954, + "kl": 0.025909423828125, + "learning_rate": 6.383036066340196e-07, + "loss": 0.102, + "num_tokens": 1317867241.0, + "reward": 2.4637277126312256, + "reward_std": 0.47998321056365967, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14177079498767853, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 981.9464721679688, + "completions/mean_terminated_length": 767.592529296875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.49395343881519366, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11954228812420696, + "kl": 0.023834228515625, + "learning_rate": 6.379753248297341e-07, + "loss": 0.1069, + "num_tokens": 1318381681.0, + "reward": 2.3716518878936768, + "reward_std": 0.4848964810371399, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.15348808467388153, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 813.8013916015625, + "completions/mean_terminated_length": 641.0763549804688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4941665334824996, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1456615441567663, + "kl": 0.03265380859375, + "learning_rate": 6.376469943378405e-07, + "loss": 0.0554, + "num_tokens": 1318815352.0, + "reward": 2.4408483505249023, + "reward_std": 0.4136618971824646, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266100406646729, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 931.2188110351562, + "completions/mean_terminated_length": 713.8186645507812, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.4943796281498056, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14327714887387763, + "kl": 0.027862548828125, + "learning_rate": 6.373186153400441e-07, + "loss": 0.1063, + "num_tokens": 1319307274.0, + "reward": 2.322544813156128, + "reward_std": 0.41019299626350403, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15822990238666534, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1009.1585083007812, + "completions/mean_terminated_length": 806.9306640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4945927228171115, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11449580660970503, + "kl": 0.0244140625, + "learning_rate": 6.36990188018078e-07, + "loss": 0.0088, + "num_tokens": 1319828593.0, + "reward": 2.2544643878936768, + "reward_std": 0.41315004229545593, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14619384706020355, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 987.7232666015625, + "completions/mean_terminated_length": 814.223388671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49480581748441743, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12978094996689116, + "kl": 0.028564453125, + "learning_rate": 6.366617125537013e-07, + "loss": 0.0457, + "num_tokens": 1320343989.0, + "reward": 2.3978796005249023, + "reward_std": 0.45256125926971436, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1420171558856964, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 917.4285888671875, + "completions/mean_terminated_length": 729.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4950189121517234, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13803362656728765, + "kl": 0.0286865234375, + "learning_rate": 6.363331891287002e-07, + "loss": 0.0524, + "num_tokens": 1320820373.0, + "reward": 2.4056921005249023, + "reward_std": 0.48847725987434387, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.17171035706996918, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 951.9754638671875, + "completions/mean_terminated_length": 745.5623168945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49523200681902935, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12980912994963212, + "kl": 0.027252197265625, + "learning_rate": 6.360046179248868e-07, + "loss": 0.0789, + "num_tokens": 1321311018.0, + "reward": 2.3939733505249023, + "reward_std": 0.4923938512802124, + "rewards/accuracy_reward/mean": 0.5370370149612427, + "rewards/accuracy_reward/std": 0.49920448660850525, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.17634892463684082, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 999.3750610351562, + "completions/mean_terminated_length": 750.254150390625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.4954451014863353, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13489212029771006, + "kl": 0.02777099609375, + "learning_rate": 6.356759991241008e-07, + "loss": 0.0452, + "num_tokens": 1321831906.0, + "reward": 2.349888563156128, + "reward_std": 0.5136919021606445, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15488377213478088, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 978.8170166015625, + "completions/mean_terminated_length": 810.2894287109375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.49565819615364126, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13341273547399218, + "kl": 0.028564453125, + "learning_rate": 6.353473329082072e-07, + "loss": 0.0555, + "num_tokens": 1322332976.0, + "reward": 2.4029018878936768, + "reward_std": 0.4221830666065216, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1569942682981491, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 918.5402221679688, + "completions/mean_terminated_length": 753.887451171875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4958712908209472, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11686818221012897, + "kl": 0.029022216796875, + "learning_rate": 6.350186194590974e-07, + "loss": 0.0789, + "num_tokens": 1322809298.0, + "reward": 2.41015625, + "reward_std": 0.408230185508728, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14936910569667816, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1084.9241943359375, + "completions/mean_terminated_length": 829.1920776367188, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4960843854882532, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1188251701991562, + "kl": 0.024169921875, + "learning_rate": 6.346898589586897e-07, + "loss": 0.0768, + "num_tokens": 1323368512.0, + "reward": 2.3175225257873535, + "reward_std": 0.40248122811317444, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.4966535270214081, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1217813789844513, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 831.9933471679688, + "completions/mean_terminated_length": 679.2286376953125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.49629748015555913, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1572690744195642, + "kl": 0.03399658203125, + "learning_rate": 6.34361051588927e-07, + "loss": 0.0795, + "num_tokens": 1323813069.0, + "reward": 2.321986675262451, + "reward_std": 0.42697685956954956, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13700605928897858, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 935.9732666015625, + "completions/mean_terminated_length": 733.5198364257812, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.49651057482286504, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.148269396194928, + "kl": 0.02978515625, + "learning_rate": 6.3403219753178e-07, + "loss": 0.0906, + "num_tokens": 1324304689.0, + "reward": 2.390625, + "reward_std": 0.47408512234687805, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15090012550354004, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 951.01123046875, + "completions/mean_terminated_length": 730.43701171875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.496723669490171, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13564127958370173, + "kl": 0.026824951171875, + "learning_rate": 6.337032969692436e-07, + "loss": 0.0765, + "num_tokens": 1324800774.0, + "reward": 2.3722100257873535, + "reward_std": 0.44210124015808105, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15578390657901764, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 953.7835083007812, + "completions/mean_terminated_length": 754.5725708007812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.49693676415747695, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.25274424419260066, + "kl": 0.03350830078125, + "learning_rate": 6.333743500833392e-07, + "loss": 0.0719, + "num_tokens": 1325302645.0, + "reward": 2.376674175262451, + "reward_std": 0.37470710277557373, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12359261512756348, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 879.247802734375, + "completions/mean_terminated_length": 764.6642456054688, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4971498588247829, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.137855395878548, + "kl": 0.030426025390625, + "learning_rate": 6.330453570561138e-07, + "loss": 0.044, + "num_tokens": 1325765556.0, + "reward": 2.463169813156128, + "reward_std": 0.41422560811042786, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11752051115036011, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1048.6875, + "completions/mean_terminated_length": 790.438232421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49736295349208887, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13968529825377668, + "kl": 0.024658203125, + "learning_rate": 6.327163180696401e-07, + "loss": 0.0946, + "num_tokens": 1326309016.0, + "reward": 2.3270089626312256, + "reward_std": 0.4345180094242096, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13865114748477936, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 929.6317138671875, + "completions/mean_terminated_length": 726.0238037109375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4975760481593948, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1552874175575847, + "kl": 0.031646728515625, + "learning_rate": 6.323872333060154e-07, + "loss": 0.0401, + "num_tokens": 1326798563.0, + "reward": 2.353236675262451, + "reward_std": 0.4689759612083435, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.14050592482089996, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 893.5513916015625, + "completions/mean_terminated_length": 711.583984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4977891428267008, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.21169147350488637, + "kl": 0.035552978515625, + "learning_rate": 6.320581029473636e-07, + "loss": 0.0795, + "num_tokens": 1327278026.0, + "reward": 2.4302456378936768, + "reward_std": 0.4072837233543396, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10962118953466415, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 967.4866333007812, + "completions/mean_terminated_length": 793.9326171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49800223749400674, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13081704392667673, + "kl": 0.027801513671875, + "learning_rate": 6.317289271758325e-07, + "loss": 0.0858, + "num_tokens": 1327785748.0, + "reward": 2.3833706378936768, + "reward_std": 0.4488319456577301, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14661914110183716, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 930.1964721679688, + "completions/mean_terminated_length": 760.6580810546875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.49821533216131264, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1377822455684268, + "kl": 0.02850341796875, + "learning_rate": 6.313997061735963e-07, + "loss": 0.0522, + "num_tokens": 1328274940.0, + "reward": 2.392857313156128, + "reward_std": 0.4331909418106079, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10649171471595764, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 918.3438110351562, + "completions/mean_terminated_length": 705.5968017578125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4984284268286186, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14210385122744254, + "kl": 0.02813720703125, + "learning_rate": 6.310704401228532e-07, + "loss": 0.1131, + "num_tokens": 1328757014.0, + "reward": 2.427455425262451, + "reward_std": 0.40171927213668823, + "rewards/accuracy_reward/mean": 0.5370370149612427, + "rewards/accuracy_reward/std": 0.49920448660850525, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12848198413848877, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1033.1160888671875, + "completions/mean_terminated_length": 812.4891357421875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.49864152149592456, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12023240623064611, + "kl": 0.023101806640625, + "learning_rate": 6.30741129205827e-07, + "loss": 0.0497, + "num_tokens": 1329292602.0, + "reward": 2.3175225257873535, + "reward_std": 0.39013761281967163, + "rewards/accuracy_reward/mean": 0.42592594027519226, + "rewards/accuracy_reward/std": 0.4950558841228485, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1506175547838211, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 975.044677734375, + "completions/mean_terminated_length": 723.8016357421875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.4988546161632305, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13396653816847484, + "kl": 0.02801513671875, + "learning_rate": 6.304117736047659e-07, + "loss": 0.0276, + "num_tokens": 1329803662.0, + "reward": 2.3705358505249023, + "reward_std": 0.40252885222435, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549686908722, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12951265275478363, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 998.52685546875, + "completions/mean_terminated_length": 790.8770141601562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49906771083053647, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11945604638024607, + "kl": 0.026702880859375, + "learning_rate": 6.300823735019432e-07, + "loss": 0.0643, + "num_tokens": 1330320218.0, + "reward": 2.34765625, + "reward_std": 0.4534340798854828, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.1706821471452713, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 989.6875610351562, + "completions/mean_terminated_length": 770.0377197265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49928080549784243, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13630784708423205, + "kl": 0.02728271484375, + "learning_rate": 6.297529290796565e-07, + "loss": 0.1159, + "num_tokens": 1330836766.0, + "reward": 2.2700893878936768, + "reward_std": 0.47162166237831116, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387789011001587, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.1423168033361435, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 823.0402221679688, + "completions/mean_terminated_length": 696.3201904296875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.4994939001651484, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14460942127160265, + "kl": 0.03515625, + "learning_rate": 6.294234405202281e-07, + "loss": 0.0283, + "num_tokens": 1331271696.0, + "reward": 2.4542412757873535, + "reward_std": 0.47995179891586304, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.18162193894386292, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 761.1295166015625, + "completions/mean_terminated_length": 665.4628295898438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.49970699483245434, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15260264162445966, + "kl": 0.03515625, + "learning_rate": 6.290939080060047e-07, + "loss": 0.0809, + "num_tokens": 1331675578.0, + "reward": 2.5027902126312256, + "reward_std": 0.47969430685043335, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.13839317858219147, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1068.529052734375, + "completions/mean_terminated_length": 822.2932739257812, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.49992008949976025, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12303925519756481, + "kl": 0.02471923828125, + "learning_rate": 6.287643317193575e-07, + "loss": 0.0618, + "num_tokens": 1332226167.0, + "reward": 2.196986675262451, + "reward_std": 0.4591960906982422, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067808747291565, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.1634487360715866, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 959.2879638671875, + "completions/mean_terminated_length": 757.674560546875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5001331841670662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1302247454302692, + "kl": 0.028167724609375, + "learning_rate": 6.284347118426813e-07, + "loss": 0.096, + "num_tokens": 1332729464.0, + "reward": 2.302455425262451, + "reward_std": 0.49819159507751465, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3480229377746582, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.2105279266834259, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 987.310302734375, + "completions/mean_terminated_length": 826.4344482421875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5003462788343722, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11648255671874307, + "kl": 0.027435302734375, + "learning_rate": 6.28105048558396e-07, + "loss": 0.0277, + "num_tokens": 1333237555.0, + "reward": 2.306361675262451, + "reward_std": 0.45896437764167786, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.12691166996955872, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1088.7679443359375, + "completions/mean_terminated_length": 844.2577514648438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5005593735016781, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12653991344412277, + "kl": 0.023590087890625, + "learning_rate": 6.277753420489447e-07, + "loss": 0.0603, + "num_tokens": 1333795339.0, + "reward": 2.2645089626312256, + "reward_std": 0.4586235582828522, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14835961163043976, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 854.919677734375, + "completions/mean_terminated_length": 718.3980102539062, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.500772468168984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14901950347911191, + "kl": 0.03173828125, + "learning_rate": 6.274455924967946e-07, + "loss": 0.0713, + "num_tokens": 1334247527.0, + "reward": 2.53125, + "reward_std": 0.4887952208518982, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13940991461277008, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 999.841552734375, + "completions/mean_terminated_length": 805.7380981445312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.50098556283629, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1216090166935823, + "kl": 0.02642822265625, + "learning_rate": 6.271158000844374e-07, + "loss": 0.0745, + "num_tokens": 1334772848.0, + "reward": 2.3956475257873535, + "reward_std": 0.4995259642601013, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.2979368567466736, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.17260229587554932, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 908.4754638671875, + "completions/mean_terminated_length": 728.8604736328125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5011986575035959, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12732142572870545, + "kl": 0.02923583984375, + "learning_rate": 6.267859649943872e-07, + "loss": 0.0641, + "num_tokens": 1335247541.0, + "reward": 2.4481027126312256, + "reward_std": 0.4220970571041107, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14566238224506378, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 911.6451416015625, + "completions/mean_terminated_length": 742.6487426757812, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.501411752170902, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1387441313573059, + "kl": 0.028533935546875, + "learning_rate": 6.26456087409183e-07, + "loss": 0.1202, + "num_tokens": 1335724838.0, + "reward": 2.4425225257873535, + "reward_std": 0.4729068875312805, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13124457001686096, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 988.6629638671875, + "completions/mean_terminated_length": 821.6873779296875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5016248468382078, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1262241091810573, + "kl": 0.02801513671875, + "learning_rate": 6.261261675113866e-07, + "loss": 0.0112, + "num_tokens": 1336234591.0, + "reward": 2.294642925262451, + "reward_std": 0.51183021068573, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.8794642686843872, + "rewards/format_reward/std": 0.3259509205818176, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.1671055108308792, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 934.0848388671875, + "completions/mean_terminated_length": 745.0391845703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5018379415055139, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13346042355819976, + "kl": 0.028564453125, + "learning_rate": 6.257962054835835e-07, + "loss": 0.0434, + "num_tokens": 1336722805.0, + "reward": 2.420201063156128, + "reward_std": 0.4596903920173645, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13230563700199127, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1079.15625, + "completions/mean_terminated_length": 858.8438720703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5020510361728198, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12441041586216225, + "kl": 0.024444580078125, + "learning_rate": 6.254662015083822e-07, + "loss": 0.084, + "num_tokens": 1337282459.0, + "reward": 2.3683037757873535, + "reward_std": 0.4865238070487976, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13840332627296448, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1033.044677734375, + "completions/mean_terminated_length": 802.24658203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5022641308401258, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.123684119602041, + "kl": 0.025421142578125, + "learning_rate": 6.25136155768415e-07, + "loss": 0.0923, + "num_tokens": 1337815359.0, + "reward": 2.4090402126312256, + "reward_std": 0.4522753059864044, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.1098259910941124, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 937.1473388671875, + "completions/mean_terminated_length": 791.2777709960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5024772255074317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16593296882699007, + "kl": 0.02984619140625, + "learning_rate": 6.248060684463366e-07, + "loss": 0.1772, + "num_tokens": 1338304129.0, + "reward": 2.4760046005249023, + "reward_std": 0.5160124897956848, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.17038200795650482, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1074.984375, + "completions/mean_terminated_length": 873.0377197265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5026903201747376, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11381415365444424, + "kl": 0.023406982421875, + "learning_rate": 6.244759397248253e-07, + "loss": 0.0653, + "num_tokens": 1338861402.0, + "reward": 2.5234375, + "reward_std": 0.4501384198665619, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.216333270072937, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14262787997722626, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1037.3773193359375, + "completions/mean_terminated_length": 821.0108642578125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5029034148420436, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.24337235481109948, + "kl": 0.0264892578125, + "learning_rate": 6.24145769786582e-07, + "loss": 0.0592, + "num_tokens": 1339403443.0, + "reward": 2.30078125, + "reward_std": 0.39265257120132446, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.1038430780172348, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 925.5402221679688, + "completions/mean_terminated_length": 748.614990234375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.5031165095093495, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15258267265923403, + "kl": 0.030731201171875, + "learning_rate": 6.238155588143306e-07, + "loss": 0.1103, + "num_tokens": 1339888053.0, + "reward": 2.4949777126312256, + "reward_std": 0.47983109951019287, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15968577563762665, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1096.1875, + "completions/mean_terminated_length": 853.5686645507812, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.5033296041766555, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12329117046407867, + "kl": 0.023681640625, + "learning_rate": 6.234853069908174e-07, + "loss": 0.1047, + "num_tokens": 1340450473.0, + "reward": 2.3521206378936768, + "reward_std": 0.5603268146514893, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.18919269740581512, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1001.3013916015625, + "completions/mean_terminated_length": 817.2362060546875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5035426988439614, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13599712589471538, + "kl": 0.0308837890625, + "learning_rate": 6.231550144988116e-07, + "loss": 0.042, + "num_tokens": 1340972128.0, + "reward": 2.4425225257873535, + "reward_std": 0.43468981981277466, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10892428457736969, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 931.5089721679688, + "completions/mean_terminated_length": 778.4873046875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.5037557935112674, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1423457383477721, + "kl": 0.028076171875, + "learning_rate": 6.22824681521105e-07, + "loss": 0.063, + "num_tokens": 1341454644.0, + "reward": 2.446986675262451, + "reward_std": 0.43831828236579895, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1477556824684143, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1103.044677734375, + "completions/mean_terminated_length": 795.5148315429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5039688881785733, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13752616984853727, + "kl": 0.025115966796875, + "learning_rate": 6.224943082405112e-07, + "loss": 0.0759, + "num_tokens": 1342025288.0, + "reward": 2.1316964626312256, + "reward_std": 0.3737923800945282, + "rewards/accuracy_reward/mean": 0.2433035671710968, + "rewards/accuracy_reward/std": 0.42955654859542847, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.1605832874774933, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1047.94873046875, + "completions/mean_terminated_length": 806.9390258789062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5041819828458792, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13662966443401997, + "kl": 0.02508544921875, + "learning_rate": 6.221638948398668e-07, + "loss": 0.0395, + "num_tokens": 1342565329.0, + "reward": 2.345424175262451, + "reward_std": 0.3844912648200989, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11214316636323929, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 854.0267944335938, + "completions/mean_terminated_length": 686.9312744140625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.5043950775131852, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14258892542209364, + "kl": 0.03070068359375, + "learning_rate": 6.218334415020303e-07, + "loss": 0.0649, + "num_tokens": 1343016685.0, + "reward": 2.4771206378936768, + "reward_std": 0.4083729684352875, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.150824636220932, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1038.0625, + "completions/mean_terminated_length": 815.1607666015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5046081721804911, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13867943430506147, + "kl": 0.02685546875, + "learning_rate": 6.215029484098823e-07, + "loss": 0.0755, + "num_tokens": 1343545321.0, + "reward": 2.3208706378936768, + "reward_std": 0.4246404767036438, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.16107910871505737, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 913.7545166015625, + "completions/mean_terminated_length": 748.4041137695312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5048212668477972, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13797999918245624, + "kl": 0.03057861328125, + "learning_rate": 6.211724157463254e-07, + "loss": 0.0728, + "num_tokens": 1344024971.0, + "reward": 2.5044643878936768, + "reward_std": 0.42501553893089294, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12337145209312439, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1000.0335083007812, + "completions/mean_terminated_length": 754.6419067382812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5050343615151031, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1396032795565846, + "kl": 0.025665283203125, + "learning_rate": 6.208418436942842e-07, + "loss": 0.1175, + "num_tokens": 1344545210.0, + "reward": 2.44921875, + "reward_std": 0.4854455888271332, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13646738231182098, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1076.82373046875, + "completions/mean_terminated_length": 846.1022338867188, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.5052474561824091, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13677886177496132, + "kl": 0.025909423828125, + "learning_rate": 6.20511232436705e-07, + "loss": 0.0472, + "num_tokens": 1345099723.0, + "reward": 2.380580425262451, + "reward_std": 0.4701564908027649, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14457522332668304, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 981.6607666015625, + "completions/mean_terminated_length": 780.8381958007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.505460550849715, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1531722602951443, + "kl": 0.0318603515625, + "learning_rate": 6.20180582156556e-07, + "loss": 0.117, + "num_tokens": 1345611107.0, + "reward": 2.3677456378936768, + "reward_std": 0.538916826248169, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.19221055507659912, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 980.1339721679688, + "completions/mean_terminated_length": 730.0826416015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.505673645517021, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13844762772271924, + "kl": 0.0267333984375, + "learning_rate": 6.198498930368264e-07, + "loss": 0.0562, + "num_tokens": 1346127151.0, + "reward": 2.420201063156128, + "reward_std": 0.4536563456058502, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15941192209720612, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1028.4710693359375, + "completions/mean_terminated_length": 764.9971923828125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5058867401843269, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11792195244194115, + "kl": 0.0281982421875, + "learning_rate": 6.195191652605277e-07, + "loss": 0.1015, + "num_tokens": 1346653682.0, + "reward": 2.369419813156128, + "reward_std": 0.4254020154476166, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1569942682981491, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 888.1920166015625, + "completions/mean_terminated_length": 752.25439453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5060998348516328, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.17919901234290608, + "kl": 0.035888671875, + "learning_rate": 6.191883990106922e-07, + "loss": 0.0487, + "num_tokens": 1347114392.0, + "reward": 2.544642925262451, + "reward_std": 0.35875949263572693, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.11074429750442505, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 885.6942138671875, + "completions/mean_terminated_length": 729.7392578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5063129295189388, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13241347842841886, + "kl": 0.03155517578125, + "learning_rate": 6.188575944703737e-07, + "loss": 0.0906, + "num_tokens": 1347584879.0, + "reward": 2.4542412757873535, + "reward_std": 0.4288511574268341, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15750236809253693, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 927.419677734375, + "completions/mean_terminated_length": 760.7692260742188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5065260241862447, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12657821327459, + "kl": 0.029937744140625, + "learning_rate": 6.185267518226472e-07, + "loss": 0.0782, + "num_tokens": 1348071419.0, + "reward": 2.439732313156128, + "reward_std": 0.39730075001716614, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12198750674724579, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 894.3370971679688, + "completions/mean_terminated_length": 691.4619140625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5067391188535507, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1377727527660299, + "kl": 0.03436279296875, + "learning_rate": 6.181958712506091e-07, + "loss": 0.0527, + "num_tokens": 1348534162.0, + "reward": 2.451451063156128, + "reward_std": 0.4207465350627899, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.11946304887533188, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1035.65625, + "completions/mean_terminated_length": 815.58154296875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5069522135208566, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12292191967576041, + "kl": 0.0257568359375, + "learning_rate": 6.178649529373762e-07, + "loss": 0.0542, + "num_tokens": 1349068536.0, + "reward": 2.337611675262451, + "reward_std": 0.41124528646469116, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1515430212020874, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1014.47998046875, + "completions/mean_terminated_length": 819.8381958007812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5071653081881626, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1357666458468901, + "kl": 0.025360107421875, + "learning_rate": 6.175339970660862e-07, + "loss": 0.0693, + "num_tokens": 1349590703.0, + "reward": 2.427455425262451, + "reward_std": 0.4426553547382355, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15374813973903656, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 835.1473388671875, + "completions/mean_terminated_length": 689.60498046875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5073784028554685, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13977011980293846, + "kl": 0.031951904296875, + "learning_rate": 6.172030038198984e-07, + "loss": 0.1366, + "num_tokens": 1350025937.0, + "reward": 2.53515625, + "reward_std": 0.42308393120765686, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.4769369065761566, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15578389167785645, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 954.72998046875, + "completions/mean_terminated_length": 792.1410522460938, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5075914975227744, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13697717831623799, + "kl": 0.028228759765625, + "learning_rate": 6.168719733819918e-07, + "loss": 0.0753, + "num_tokens": 1350520584.0, + "reward": 2.318638563156128, + "reward_std": 0.5162521600723267, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13259780406951904, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 859.7857666015625, + "completions/mean_terminated_length": 720.5187377929688, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.5078045921900805, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13573834732359238, + "kl": 0.030670166015625, + "learning_rate": 6.165409059355666e-07, + "loss": 0.0844, + "num_tokens": 1350973288.0, + "reward": 2.6026787757873535, + "reward_std": 0.443278431892395, + "rewards/accuracy_reward/mean": 0.7075892686843872, + "rewards/accuracy_reward/std": 0.4553784728050232, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.1249600425362587, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1007.1785888671875, + "completions/mean_terminated_length": 756.3434448242188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5080176868573864, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14652890677079772, + "kl": 0.02813720703125, + "learning_rate": 6.16209801663843e-07, + "loss": 0.0985, + "num_tokens": 1351503736.0, + "reward": 2.2589287757873535, + "reward_std": 0.477927029132843, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.2068338543176651, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1012.9844360351562, + "completions/mean_terminated_length": 767.0967407226562, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5082307815246924, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1325062666358616, + "kl": 0.027435302734375, + "learning_rate": 6.158786607500624e-07, + "loss": 0.077, + "num_tokens": 1352022545.0, + "reward": 2.3939733505249023, + "reward_std": 0.3978215157985687, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14553911983966827, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1146.946533203125, + "completions/mean_terminated_length": 871.1137084960938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5084438761919983, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11355119041496733, + "kl": 0.0230712890625, + "learning_rate": 6.155474833774854e-07, + "loss": 0.0825, + "num_tokens": 1352618105.0, + "reward": 2.3002233505249023, + "reward_std": 0.4399286210536957, + "rewards/accuracy_reward/mean": 0.4236111044883728, + "rewards/accuracy_reward/std": 0.4947032034397125, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1478201001882553, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1025.97998046875, + "completions/mean_terminated_length": 783.1795654296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5086569708593043, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13476401574469568, + "kl": 0.027862548828125, + "learning_rate": 6.152162697293939e-07, + "loss": 0.0697, + "num_tokens": 1353152944.0, + "reward": 2.3521206378936768, + "reward_std": 0.48104655742645264, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.16961827874183655, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 915.5670166015625, + "completions/mean_terminated_length": 726.828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5088700655266102, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13599772013034264, + "kl": 0.030059814453125, + "learning_rate": 6.148850199890888e-07, + "loss": 0.0472, + "num_tokens": 1353632222.0, + "reward": 2.5206475257873535, + "reward_std": 0.4059658646583557, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1354389488697052, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 994.9553833007812, + "completions/mean_terminated_length": 741.1744995117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5090831601939162, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13624447988477673, + "kl": 0.031158447265625, + "learning_rate": 6.145537343398917e-07, + "loss": 0.0782, + "num_tokens": 1354138970.0, + "reward": 2.44140625, + "reward_std": 0.4164377450942993, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.12992529571056366, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 969.8839721679688, + "completions/mean_terminated_length": 687.4478759765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5092962548612221, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15657589692656326, + "kl": 0.027435302734375, + "learning_rate": 6.142224129651437e-07, + "loss": 0.105, + "num_tokens": 1354643750.0, + "reward": 2.4140625, + "reward_std": 0.41708022356033325, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14457522332668304, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 970.4219360351562, + "completions/mean_terminated_length": 750.271484375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.509509349528528, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13944631200538407, + "kl": 0.02764892578125, + "learning_rate": 6.13891056048206e-07, + "loss": 0.1195, + "num_tokens": 1355150803.0, + "reward": 2.3638393878936768, + "reward_std": 0.5155957937240601, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.17786091566085815, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1011.93310546875, + "completions/mean_terminated_length": 806.9358520507812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.509722444195834, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12983141815870317, + "kl": 0.026641845703125, + "learning_rate": 6.135596637724592e-07, + "loss": 0.0762, + "num_tokens": 1355668341.0, + "reward": 2.3621652126312256, + "reward_std": 0.4232122600078583, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14963631331920624, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 896.7344360351562, + "completions/mean_terminated_length": 697.8245849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5099355388631399, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12616642169598924, + "kl": 0.0284423828125, + "learning_rate": 6.132282363213037e-07, + "loss": 0.0519, + "num_tokens": 1356132110.0, + "reward": 2.4927456378936768, + "reward_std": 0.3441968262195587, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1372518539428711, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1033.97998046875, + "completions/mean_terminated_length": 810.1771240234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5101486335304459, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12834055217632015, + "kl": 0.028045654296875, + "learning_rate": 6.12896773878159e-07, + "loss": 0.0901, + "num_tokens": 1356667909.0, + "reward": 2.3588171005249023, + "reward_std": 0.4196842312812805, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.18230371177196503, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 903.7723388671875, + "completions/mean_terminated_length": 743.638671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5103617281977518, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.133934563703041, + "kl": 0.02923583984375, + "learning_rate": 6.125652766264644e-07, + "loss": 0.0956, + "num_tokens": 1357138815.0, + "reward": 2.4949777126312256, + "reward_std": 0.37598463892936707, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13206008076667786, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1075.7879638671875, + "completions/mean_terminated_length": 789.1820678710938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5105748228650578, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1359943249877691, + "kl": 0.026336669921875, + "learning_rate": 6.122337447496781e-07, + "loss": 0.1118, + "num_tokens": 1357688656.0, + "reward": 2.2488839626312256, + "reward_std": 0.43310675024986267, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1709425151348114, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 926.5245971679688, + "completions/mean_terminated_length": 722.3509521484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5107879175323637, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13621738131296754, + "kl": 0.03009033203125, + "learning_rate": 6.119021784312776e-07, + "loss": 0.1081, + "num_tokens": 1358179595.0, + "reward": 2.4737725257873535, + "reward_std": 0.4345763623714447, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15496434271335602, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 872.3951416015625, + "completions/mean_terminated_length": 704.4515380859375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5110010121996698, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12710492159804265, + "kl": 0.02874755859375, + "learning_rate": 6.115705778547597e-07, + "loss": 0.0557, + "num_tokens": 1358640892.0, + "reward": 2.3978796005249023, + "reward_std": 0.38329121470451355, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14827017486095428, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1062.529052734375, + "completions/mean_terminated_length": 831.7713623046875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5112141068669757, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1184153102791087, + "kl": 0.025970458984375, + "learning_rate": 6.112389432036395e-07, + "loss": 0.0731, + "num_tokens": 1359184265.0, + "reward": 2.380580425262451, + "reward_std": 0.4913359582424164, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1709425151348114, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 910.0067138671875, + "completions/mean_terminated_length": 692.0930786132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5114272015342816, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13303574340662616, + "kl": 0.0286865234375, + "learning_rate": 6.10907274661452e-07, + "loss": 0.0502, + "num_tokens": 1359662748.0, + "reward": 2.4564733505249023, + "reward_std": 0.38364139199256897, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13083133101463318, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 983.044677734375, + "completions/mean_terminated_length": 758.5405883789062, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5116402962015876, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1399615806852762, + "kl": 0.028350830078125, + "learning_rate": 6.105755724117497e-07, + "loss": 0.09, + "num_tokens": 1360173136.0, + "reward": 2.467076063156128, + "reward_std": 0.4884949326515198, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.18169322609901428, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1043.5, + "completions/mean_terminated_length": 818.4480590820312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5118533908688935, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13474856976283414, + "kl": 0.0252685546875, + "learning_rate": 6.10243836638105e-07, + "loss": 0.0628, + "num_tokens": 1360712992.0, + "reward": 2.2393975257873535, + "reward_std": 0.4417692720890045, + "rewards/accuracy_reward/mean": 0.3660714328289032, + "rewards/accuracy_reward/std": 0.4822677969932556, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.1887698918581009, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 891.5558471679688, + "completions/mean_terminated_length": 756.0125122070312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5120664855361995, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14205893620096882, + "kl": 0.0302734375, + "learning_rate": 6.099120675241078e-07, + "loss": 0.0808, + "num_tokens": 1361183561.0, + "reward": 2.4955358505249023, + "reward_std": 0.5023698210716248, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.17367035150527954, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 925.49560546875, + "completions/mean_terminated_length": 755.2442016601562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5122795802035054, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1517751241020233, + "kl": 0.029937744140625, + "learning_rate": 6.095802652533673e-07, + "loss": 0.0806, + "num_tokens": 1361669079.0, + "reward": 2.4481027126312256, + "reward_std": 0.4166407883167267, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12825222313404083, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1032.8125, + "completions/mean_terminated_length": 863.6146240234375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5124926748708114, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.7832429674798569, + "kl": 0.055450439453125, + "learning_rate": 6.092484300095104e-07, + "loss": 0.0586, + "num_tokens": 1362205123.0, + "reward": 2.349330425262451, + "reward_std": 0.44817838072776794, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.14969991147518158, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 985.232177734375, + "completions/mean_terminated_length": 757.701904296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5127057695381173, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15157403944599057, + "kl": 0.028656005859375, + "learning_rate": 6.089165619761825e-07, + "loss": 0.1251, + "num_tokens": 1362718299.0, + "reward": 2.27734375, + "reward_std": 0.5197800993919373, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.9447544813156128, + "rewards/tag_count_reward/std": 0.1910770684480667, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 921.7991333007812, + "completions/mean_terminated_length": 786.6549682617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5129188642054232, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14860180174126825, + "kl": 0.028839111328125, + "learning_rate": 6.085846613370473e-07, + "loss": 0.1141, + "num_tokens": 1363199425.0, + "reward": 2.431919813156128, + "reward_std": 0.5232210159301758, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.18007564544677734, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 930.58935546875, + "completions/mean_terminated_length": 702.3010864257812, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.5131319588727292, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1408336181338977, + "kl": 0.030487060546875, + "learning_rate": 6.082527282757862e-07, + "loss": 0.0754, + "num_tokens": 1363694777.0, + "reward": 2.3208706378936768, + "reward_std": 0.41732296347618103, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067809045314789, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.1706821471452713, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 994.2656860351562, + "completions/mean_terminated_length": 825.012939453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5133450535400351, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1520453280721132, + "kl": 0.030548095703125, + "learning_rate": 6.079207629760989e-07, + "loss": 0.0806, + "num_tokens": 1364204112.0, + "reward": 2.361607313156128, + "reward_std": 0.4760262966156006, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16487936675548553, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 996.357177734375, + "completions/mean_terminated_length": 814.65966796875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5135581482073411, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14086777766932815, + "kl": 0.02728271484375, + "learning_rate": 6.075887656217029e-07, + "loss": 0.0983, + "num_tokens": 1364719376.0, + "reward": 2.4095983505249023, + "reward_std": 0.4826275110244751, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1591111123561859, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1041.247802734375, + "completions/mean_terminated_length": 791.6629638671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.513771242874647, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13193189319816837, + "kl": 0.02691650390625, + "learning_rate": 6.072567363963331e-07, + "loss": 0.0692, + "num_tokens": 1365256735.0, + "reward": 2.349888563156128, + "reward_std": 0.445743590593338, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16229133307933807, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 809.6406860351562, + "completions/mean_terminated_length": 701.4344482421875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.513984337541953, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14569900434973557, + "kl": 0.0335693359375, + "learning_rate": 6.069246754837424e-07, + "loss": 0.0381, + "num_tokens": 1365686622.0, + "reward": 2.4341518878936768, + "reward_std": 0.38321271538734436, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12247265130281448, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1078.6004638671875, + "completions/mean_terminated_length": 841.6361083984375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.514197432209259, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11673080216470279, + "kl": 0.0252685546875, + "learning_rate": 6.065925830677007e-07, + "loss": 0.1091, + "num_tokens": 1366244747.0, + "reward": 2.3214287757873535, + "reward_std": 0.5216419100761414, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.18981274962425232, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 930.8616333007812, + "completions/mean_terminated_length": 748.05712890625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.514410526876565, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13250732675582602, + "kl": 0.02764892578125, + "learning_rate": 6.062604593319964e-07, + "loss": 0.0402, + "num_tokens": 1366729469.0, + "reward": 2.4034600257873535, + "reward_std": 0.3924359083175659, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12825222313404083, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 916.5156860351562, + "completions/mean_terminated_length": 754.875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5146236215438709, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12954069060231613, + "kl": 0.027984619140625, + "learning_rate": 6.059283044604342e-07, + "loss": 0.088, + "num_tokens": 1367207988.0, + "reward": 2.4386162757873535, + "reward_std": 0.4345270097255707, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15481625497341156, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 993.7076416015625, + "completions/mean_terminated_length": 757.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5148367162111768, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1332248091862224, + "kl": 0.028533935546875, + "learning_rate": 6.055961186368364e-07, + "loss": 0.0312, + "num_tokens": 1367725537.0, + "reward": 2.3839287757873535, + "reward_std": 0.46096891164779663, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13840332627296448, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 927.5379638671875, + "completions/mean_terminated_length": 727.0342407226562, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5150498108784828, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12682075155253925, + "kl": 0.02947998046875, + "learning_rate": 6.052639020450424e-07, + "loss": 0.0662, + "num_tokens": 1368211986.0, + "reward": 2.44140625, + "reward_std": 0.45993292331695557, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14321638643741608, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1008.44873046875, + "completions/mean_terminated_length": 809.3856201171875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5152629055457887, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14025416724026074, + "kl": 0.029052734375, + "learning_rate": 6.049316548689087e-07, + "loss": 0.0925, + "num_tokens": 1368729563.0, + "reward": 2.345982313156128, + "reward_std": 0.450383722782135, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1826944798231125, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 970.3527221679688, + "completions/mean_terminated_length": 746.6900024414062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5154760002130947, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1348113331352353, + "kl": 0.027862548828125, + "learning_rate": 6.045993772923087e-07, + "loss": 0.0928, + "num_tokens": 1369232809.0, + "reward": 2.3113839626312256, + "reward_std": 0.4495818614959717, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.1974812150001526, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 868.5469360351562, + "completions/mean_terminated_length": 752.9142456054688, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.5156890948804006, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14470634778148325, + "kl": 0.0338134765625, + "learning_rate": 6.042670694991326e-07, + "loss": 0.0867, + "num_tokens": 1369686254.0, + "reward": 2.537388563156128, + "reward_std": 0.4362529516220093, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407235741615295, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1212.32373046875, + "completions/mean_terminated_length": 930.4388427734375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5159021895477066, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11663098067290441, + "kl": 0.022735595703125, + "learning_rate": 6.039347316732874e-07, + "loss": 0.0621, + "num_tokens": 1370311103.0, + "reward": 2.2315850257873535, + "reward_std": 0.4198766052722931, + "rewards/accuracy_reward/mean": 0.3482142984867096, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1372518688440323, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 959.732177734375, + "completions/mean_terminated_length": 810.5786743164062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5161152842150125, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12176151342103846, + "kl": 0.029266357421875, + "learning_rate": 6.036023639986963e-07, + "loss": 0.066, + "num_tokens": 1370812423.0, + "reward": 2.4955358505249023, + "reward_std": 0.42358651757240295, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13731664419174194, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1063.2232666015625, + "completions/mean_terminated_length": 819.0863647460938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5163283788823184, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13180252181338825, + "kl": 0.0263671875, + "learning_rate": 6.032699666593001e-07, + "loss": 0.0941, + "num_tokens": 1371356011.0, + "reward": 2.2857143878936768, + "reward_std": 0.45496076345443726, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14903545379638672, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 976.1428833007812, + "completions/mean_terminated_length": 764.064208984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5165414735496244, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.17003535714402265, + "kl": 0.02728271484375, + "learning_rate": 6.029375398390545e-07, + "loss": 0.1389, + "num_tokens": 1371861723.0, + "reward": 2.330357313156128, + "reward_std": 0.4743497371673584, + "rewards/accuracy_reward/mean": 0.44675925374031067, + "rewards/accuracy_reward/std": 0.4977337718009949, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14238695800304413, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 999.6339721679688, + "completions/mean_terminated_length": 805.4920654296875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5167545682169303, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1282537811590294, + "kl": 0.02587890625, + "learning_rate": 6.026050837219327e-07, + "loss": 0.0752, + "num_tokens": 1372383623.0, + "reward": 2.364955425262451, + "reward_std": 0.4266434609889984, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13763901591300964, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 945.263427734375, + "completions/mean_terminated_length": 723.5335083007812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5169676628842363, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12431075887668937, + "kl": 0.029022216796875, + "learning_rate": 6.022725984919235e-07, + "loss": 0.0535, + "num_tokens": 1372877533.0, + "reward": 2.3978796005249023, + "reward_std": 0.40170004963874817, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14590215682983398, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 878.1964721679688, + "completions/mean_terminated_length": 753.9951171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5171807575515422, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13918978694888412, + "kl": 0.031768798828125, + "learning_rate": 6.019400843330323e-07, + "loss": 0.0859, + "num_tokens": 1373337493.0, + "reward": 2.517857313156128, + "reward_std": 0.4215749502182007, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12561768293380737, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 945.8460083007812, + "completions/mean_terminated_length": 778.6812133789062, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.5173938522188483, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13742465278040455, + "kl": 0.029937744140625, + "learning_rate": 6.016075414292802e-07, + "loss": 0.0524, + "num_tokens": 1373823184.0, + "reward": 2.4916296005249023, + "reward_std": 0.43216538429260254, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.17153577506542206, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1038.1116943359375, + "completions/mean_terminated_length": 847.92041015625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5176069468861542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1317303705896818, + "kl": 0.02667236328125, + "learning_rate": 6.01274969964704e-07, + "loss": 0.068, + "num_tokens": 1374357874.0, + "reward": 2.372767925262451, + "reward_std": 0.4326055645942688, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13526484370231628, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1053.7232666015625, + "completions/mean_terminated_length": 863.3297729492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5178200415534602, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12317320852729442, + "kl": 0.02532958984375, + "learning_rate": 6.009423701233567e-07, + "loss": 0.0684, + "num_tokens": 1374903814.0, + "reward": 2.4614956378936768, + "reward_std": 0.4561314582824707, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15894921123981476, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1038.88623046875, + "completions/mean_terminated_length": 842.4453125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.5180331362207661, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13316676861104781, + "kl": 0.0272216796875, + "learning_rate": 6.006097420893069e-07, + "loss": 0.0981, + "num_tokens": 1375437779.0, + "reward": 2.392857313156128, + "reward_std": 0.41668689250946045, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.18555572628974915, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 842.5111694335938, + "completions/mean_terminated_length": 733.9878540039062, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.518246230888072, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13232592861974118, + "kl": 0.029754638671875, + "learning_rate": 6.002770860466386e-07, + "loss": 0.0682, + "num_tokens": 1375885064.0, + "reward": 2.5396206378936768, + "reward_std": 0.378508597612381, + "rewards/accuracy_reward/mean": 0.6388888955116272, + "rewards/accuracy_reward/std": 0.480879545211792, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.1137678399682045, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 930.9420166015625, + "completions/mean_terminated_length": 784.257568359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.518459325555378, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1410193043983278, + "kl": 0.028961181640625, + "learning_rate": 5.999444021794517e-07, + "loss": 0.0676, + "num_tokens": 1376366958.0, + "reward": 2.490513563156128, + "reward_std": 0.45106399059295654, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13826683163642883, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 956.58935546875, + "completions/mean_terminated_length": 781.2849731445312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5186724202226839, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13035982669917046, + "kl": 0.02764892578125, + "learning_rate": 5.99611690671861e-07, + "loss": 0.0704, + "num_tokens": 1376859382.0, + "reward": 2.4676339626312256, + "reward_std": 0.44317126274108887, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16545002162456512, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 863.466552734375, + "completions/mean_terminated_length": 662.43603515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5188855148899899, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1726558136881807, + "kl": 0.03240966796875, + "learning_rate": 5.99278951707997e-07, + "loss": 0.1274, + "num_tokens": 1377306823.0, + "reward": 2.38671875, + "reward_std": 0.43243926763534546, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1625526398420334, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 870.4910888671875, + "completions/mean_terminated_length": 722.5628051757812, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5190986095572958, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1461447707298886, + "kl": 0.029937744140625, + "learning_rate": 5.989461854720052e-07, + "loss": 0.0885, + "num_tokens": 1377758819.0, + "reward": 2.5111608505249023, + "reward_std": 0.4281720519065857, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15274205803871155, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 970.6451416015625, + "completions/mean_terminated_length": 781.18896484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5193117042246018, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12710555187313863, + "kl": 0.0274658203125, + "learning_rate": 5.986133921480463e-07, + "loss": 0.0973, + "num_tokens": 1378262020.0, + "reward": 2.357142925262451, + "reward_std": 0.48469728231430054, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.18097810447216034, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1054.5625, + "completions/mean_terminated_length": 818.552490234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5195247988919077, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13336666132006755, + "kl": 0.027008056640625, + "learning_rate": 5.982805719202958e-07, + "loss": 0.1206, + "num_tokens": 1378798256.0, + "reward": 2.32421875, + "reward_std": 0.5027849674224854, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16989772021770477, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 974.888427734375, + "completions/mean_terminated_length": 815.2974853515625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5197378935592136, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13637060998037667, + "kl": 0.026458740234375, + "learning_rate": 5.979477249729442e-07, + "loss": 0.0872, + "num_tokens": 1379308398.0, + "reward": 2.4620537757873535, + "reward_std": 0.469237357378006, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12891364097595215, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 964.263427734375, + "completions/mean_terminated_length": 753.2960205078125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5199509882265196, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13622522182208557, + "kl": 0.028167724609375, + "learning_rate": 5.976148514901971e-07, + "loss": 0.0505, + "num_tokens": 1379805092.0, + "reward": 2.4129464626312256, + "reward_std": 0.429694265127182, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13940991461277008, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1043.19873046875, + "completions/mean_terminated_length": 801.0443115234375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5201640828938255, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11635314453987461, + "kl": 0.025115966796875, + "learning_rate": 5.972819516562743e-07, + "loss": 0.0869, + "num_tokens": 1380336829.0, + "reward": 2.385044813156128, + "reward_std": 0.46809110045433044, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.2979368567466736, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.18929573893547058, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 998.4285888671875, + "completions/mean_terminated_length": 807.345703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5203771775611316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1350455071354124, + "kl": 0.027252197265625, + "learning_rate": 5.969490256554104e-07, + "loss": 0.1034, + "num_tokens": 1380856189.0, + "reward": 2.3292412757873535, + "reward_std": 0.48902902007102966, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14550481736660004, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1019.87060546875, + "completions/mean_terminated_length": 803.1297607421875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.5205902722284375, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12401404166913708, + "kl": 0.02496337890625, + "learning_rate": 5.966160736718543e-07, + "loss": 0.0952, + "num_tokens": 1381375059.0, + "reward": 2.428013563156128, + "reward_std": 0.4561253488063812, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12540756165981293, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1068.47998046875, + "completions/mean_terminated_length": 818.79833984375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5208033668957435, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1256169204408851, + "kl": 0.025054931640625, + "learning_rate": 5.962830958898697e-07, + "loss": 0.0703, + "num_tokens": 1381921338.0, + "reward": 2.2377233505249023, + "reward_std": 0.4334922432899475, + "rewards/accuracy_reward/mean": 0.38461539149284363, + "rewards/accuracy_reward/std": 0.4870900511741638, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.16051718592643738, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 840.0267944335938, + "completions/mean_terminated_length": 684.8463134765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5210164615630494, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1458769375932124, + "kl": 0.032257080078125, + "learning_rate": 5.959500924937341e-07, + "loss": 0.0467, + "num_tokens": 1382357590.0, + "reward": 2.4642858505249023, + "reward_std": 0.45937252044677734, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14955390989780426, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1016.7366333007812, + "completions/mean_terminated_length": 785.6884765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5212295562303554, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13992978756785904, + "kl": 0.02557373046875, + "learning_rate": 5.956170636677396e-07, + "loss": 0.111, + "num_tokens": 1382879920.0, + "reward": 2.4419643878936768, + "reward_std": 0.45922961831092834, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14903545379638672, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1033.1607666015625, + "completions/mean_terminated_length": 802.3890380859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5214426508976613, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12265596561269321, + "kl": 0.0274658203125, + "learning_rate": 5.952840095961919e-07, + "loss": 0.0363, + "num_tokens": 1383411752.0, + "reward": 2.34375, + "reward_std": 0.4808152914047241, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14997069537639618, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 933.21435546875, + "completions/mean_terminated_length": 733.726318359375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.5216557455649672, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15418116188495984, + "kl": 0.027130126953125, + "learning_rate": 5.949509304634113e-07, + "loss": 0.1096, + "num_tokens": 1383903720.0, + "reward": 2.407924175262451, + "reward_std": 0.4646216630935669, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14566238224506378, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 981.7567138671875, + "completions/mean_terminated_length": 770.7887573242188, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5218688402322732, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10868035069385899, + "kl": 0.028656005859375, + "learning_rate": 5.946178264537312e-07, + "loss": 0.0395, + "num_tokens": 1384408027.0, + "reward": 2.377232313156128, + "reward_std": 0.36640506982803345, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14433756470680237, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1012.9910888671875, + "completions/mean_terminated_length": 834.1675415039062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5220819348995791, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12215717705497343, + "kl": 0.02606201171875, + "learning_rate": 5.942846977514993e-07, + "loss": 0.044, + "num_tokens": 1384929735.0, + "reward": 2.4910714626312256, + "reward_std": 0.4684024751186371, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14088858664035797, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 958.9241333007812, + "completions/mean_terminated_length": 725.7615356445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5222950295668851, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1292888233626093, + "kl": 0.0296630859375, + "learning_rate": 5.939515445410772e-07, + "loss": 0.0758, + "num_tokens": 1385434197.0, + "reward": 2.302455425262451, + "reward_std": 0.440422385931015, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.17634892463684082, + "step": 2451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 928.76123046875, + "completions/mean_terminated_length": 772.1246948242188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.522508124234191, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14005163755878053, + "kl": 0.029022216796875, + "learning_rate": 5.936183670068391e-07, + "loss": 0.0588, + "num_tokens": 1385920394.0, + "reward": 2.4598214626312256, + "reward_std": 0.45283758640289307, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14523428678512573, + "step": 2452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 840.419677734375, + "completions/mean_terminated_length": 702.23876953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.522721218901497, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14143409357067108, + "kl": 0.0313720703125, + "learning_rate": 5.932851653331738e-07, + "loss": 0.0464, + "num_tokens": 1386363126.0, + "reward": 2.493861675262451, + "reward_std": 0.3823241591453552, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1354389488697052, + "step": 2453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 935.3660888671875, + "completions/mean_terminated_length": 746.537841796875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5229343135688029, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14328495436049649, + "kl": 0.0303955078125, + "learning_rate": 5.929519397044825e-07, + "loss": 0.0993, + "num_tokens": 1386850074.0, + "reward": 2.540736675262451, + "reward_std": 0.4627631902694702, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11561823636293411, + "step": 2454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 970.2857666015625, + "completions/mean_terminated_length": 728.83056640625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.523147408236109, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.21351183756019135, + "kl": 0.0328369140625, + "learning_rate": 5.926186903051804e-07, + "loss": 0.141, + "num_tokens": 1387360138.0, + "reward": 2.4503350257873535, + "reward_std": 0.4587583839893341, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1579248607158661, + "step": 2455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 989.7388916015625, + "completions/mean_terminated_length": 741.9366455078125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5233605029034148, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1227528978890609, + "kl": 0.02587890625, + "learning_rate": 5.922854173196953e-07, + "loss": 0.0493, + "num_tokens": 1387873557.0, + "reward": 2.3504464626312256, + "reward_std": 0.4151289165019989, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12083587795495987, + "step": 2456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1056.1295166015625, + "completions/mean_terminated_length": 796.2872924804688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5235735975707208, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12923648512889652, + "kl": 0.023681640625, + "learning_rate": 5.919521209324684e-07, + "loss": 0.0735, + "num_tokens": 1388416607.0, + "reward": 2.3510046005249023, + "reward_std": 0.37309274077415466, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.15014436841011047, + "step": 2457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1010.8973388671875, + "completions/mean_terminated_length": 822.08447265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5237866922380268, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.11451177074598225, + "kl": 0.026275634765625, + "learning_rate": 5.916188013279536e-07, + "loss": 0.0636, + "num_tokens": 1388944177.0, + "reward": 2.3002233505249023, + "reward_std": 0.31751203536987305, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.10537806153297424, + "step": 2458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 966.5402221679688, + "completions/mean_terminated_length": 769.6517333984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5239997869053327, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1460635703868407, + "kl": 0.030487060546875, + "learning_rate": 5.912854586906183e-07, + "loss": 0.0747, + "num_tokens": 1389441635.0, + "reward": 2.44140625, + "reward_std": 0.39540570974349976, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.1489589959383011, + "step": 2459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 982.74560546875, + "completions/mean_terminated_length": 736.9176025390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5242128815726387, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13459047144974556, + "kl": 0.027740478515625, + "learning_rate": 5.909520932049414e-07, + "loss": 0.04, + "num_tokens": 1389955233.0, + "reward": 2.467076063156128, + "reward_std": 0.40677663683891296, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.11601705849170685, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 853.3192138671875, + "completions/mean_terminated_length": 657.8259887695312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5244259762399446, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1495937155902843, + "kl": 0.0316162109375, + "learning_rate": 5.906187050554156e-07, + "loss": 0.1287, + "num_tokens": 1390403264.0, + "reward": 2.38671875, + "reward_std": 0.38554078340530396, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.15291257202625275, + "step": 2461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 910.1004638671875, + "completions/mean_terminated_length": 740.8743896484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5246390709072506, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1500674984180217, + "kl": 0.03179931640625, + "learning_rate": 5.902852944265456e-07, + "loss": 0.0894, + "num_tokens": 1390877981.0, + "reward": 2.470424175262451, + "reward_std": 0.48989835381507874, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.18459028005599976, + "step": 2462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 877.5022583007812, + "completions/mean_terminated_length": 689.4948120117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5248521655745565, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13166518872955504, + "kl": 0.029693603515625, + "learning_rate": 5.899518615028489e-07, + "loss": 0.0941, + "num_tokens": 1391334686.0, + "reward": 2.4207589626312256, + "reward_std": 0.4612061083316803, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266100406646729, + "step": 2463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 878.9308471679688, + "completions/mean_terminated_length": 738.6424560546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5250652602418624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2578161802889121, + "kl": 0.031951904296875, + "learning_rate": 5.896184064688549e-07, + "loss": 0.0979, + "num_tokens": 1391801103.0, + "reward": 2.4754464626312256, + "reward_std": 0.46477192640304565, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14040927588939667, + "step": 2464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 905.77685546875, + "completions/mean_terminated_length": 762.2813720703125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5252783549091684, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13054103728060573, + "kl": 0.030548095703125, + "learning_rate": 5.892849295091053e-07, + "loss": 0.061, + "num_tokens": 1392269291.0, + "reward": 2.5440850257873535, + "reward_std": 0.4051132798194885, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9860491156578064, + "rewards/tag_count_reward/std": 0.07627084106206894, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 998.6027221679688, + "completions/mean_terminated_length": 766.9918212890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5254914495764743, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1226467384994295, + "kl": 0.0264892578125, + "learning_rate": 5.889514308081542e-07, + "loss": 0.0469, + "num_tokens": 1392786713.0, + "reward": 2.286830425262451, + "reward_std": 0.3710114657878876, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14409084618091583, + "step": 2466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 827.1205444335938, + "completions/mean_terminated_length": 720.4417724609375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.5257045442437803, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13635280668833064, + "kl": 0.03424072265625, + "learning_rate": 5.886179105505677e-07, + "loss": 0.0737, + "num_tokens": 1393221439.0, + "reward": 2.5580358505249023, + "reward_std": 0.44701412320137024, + "rewards/accuracy_reward/mean": 0.6696428656578064, + "rewards/accuracy_reward/std": 0.47086748480796814, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13989263772964478, + "step": 2467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 979.7723388671875, + "completions/mean_terminated_length": 808.1917114257812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5259176389110862, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13830309111282307, + "kl": 0.02899169921875, + "learning_rate": 5.882843689209237e-07, + "loss": 0.0644, + "num_tokens": 1393728201.0, + "reward": 2.4402902126312256, + "reward_std": 0.4698637127876282, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1514935940504074, + "step": 2468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 881.0045166015625, + "completions/mean_terminated_length": 721.0609130859375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5261307335783922, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13348446812520479, + "kl": 0.030242919921875, + "learning_rate": 5.879508061038119e-07, + "loss": 0.0604, + "num_tokens": 1394194331.0, + "reward": 2.5089287757873535, + "reward_std": 0.3965337574481964, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.1282729059457779, + "step": 2469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 940.3348388671875, + "completions/mean_terminated_length": 831.740234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5263438282456981, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13311364045332552, + "kl": 0.029022216796875, + "learning_rate": 5.876172222838339e-07, + "loss": 0.0764, + "num_tokens": 1394685569.0, + "reward": 2.55078125, + "reward_std": 0.4826606512069702, + "rewards/accuracy_reward/mean": 0.6607142686843872, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15343420207500458, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1140.5960693359375, + "completions/mean_terminated_length": 852.3617553710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5265569229130042, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12436721430394897, + "kl": 0.023406982421875, + "learning_rate": 5.872836176456025e-07, + "loss": 0.071, + "num_tokens": 1395267532.0, + "reward": 2.3091518878936768, + "reward_std": 0.4215041697025299, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1479213982820511, + "step": 2471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 931.6942138671875, + "completions/mean_terminated_length": 768.9590454101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5267700175803101, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14306292684992372, + "kl": 0.02886962890625, + "learning_rate": 5.869499923737427e-07, + "loss": 0.0963, + "num_tokens": 1395756387.0, + "reward": 2.4715402126312256, + "reward_std": 0.44619134068489075, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.15107274055480957, + "step": 2472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1059.0379638671875, + "completions/mean_terminated_length": 789.321044921875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.526983112247616, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11203685607425817, + "kl": 0.026275634765625, + "learning_rate": 5.866163466528903e-07, + "loss": 0.081, + "num_tokens": 1396299780.0, + "reward": 2.4174108505249023, + "reward_std": 0.4012661874294281, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.16431809961795807, + "step": 2473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 904.40185546875, + "completions/mean_terminated_length": 770.3641357421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.527196206914922, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15022492914136928, + "kl": 0.033355712890625, + "learning_rate": 5.86282680667693e-07, + "loss": 0.0719, + "num_tokens": 1396763128.0, + "reward": 2.5580358505249023, + "reward_std": 0.3992389440536499, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12511979043483734, + "step": 2474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1177.0848388671875, + "completions/mean_terminated_length": 883.3134155273438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5274093015822279, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12062285629686431, + "kl": 0.022613525390625, + "learning_rate": 5.859489946028088e-07, + "loss": 0.0556, + "num_tokens": 1397362094.0, + "reward": 2.275111675262451, + "reward_std": 0.435001403093338, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.16169020533561707, + "step": 2475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 955.1920166015625, + "completions/mean_terminated_length": 749.3845825195312, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5276223962495339, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14135841599502916, + "kl": 0.031890869140625, + "learning_rate": 5.856152886429081e-07, + "loss": 0.0588, + "num_tokens": 1397859540.0, + "reward": 2.506138563156128, + "reward_std": 0.3933805823326111, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12381463497877121, + "step": 2476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 922.9531860351562, + "completions/mean_terminated_length": 700.3502807617188, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5278354909168398, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14999812773451962, + "kl": 0.02862548828125, + "learning_rate": 5.85281562972671e-07, + "loss": 0.1251, + "num_tokens": 1398347551.0, + "reward": 2.4988839626312256, + "reward_std": 0.4405926764011383, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16029927134513855, + "step": 2477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1084.2723388671875, + "completions/mean_terminated_length": 831.8027954101562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5280485855841458, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11600180998752851, + "kl": 0.024932861328125, + "learning_rate": 5.849478177767894e-07, + "loss": 0.0435, + "num_tokens": 1398903833.0, + "reward": 2.3560268878936768, + "reward_std": 0.4728037416934967, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1573437601327896, + "step": 2478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 944.0870971679688, + "completions/mean_terminated_length": 749.9606323242188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.5282616802514517, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15122149061124673, + "kl": 0.028472900390625, + "learning_rate": 5.846140532399657e-07, + "loss": 0.0984, + "num_tokens": 1399387616.0, + "reward": 2.3660714626312256, + "reward_std": 0.4789182245731354, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13636787235736847, + "step": 2479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1008.1116333007812, + "completions/mean_terminated_length": 802.3582763671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5284747749187576, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13843920547244937, + "kl": 0.02508544921875, + "learning_rate": 5.842802695469131e-07, + "loss": 0.0952, + "num_tokens": 1399911906.0, + "reward": 2.388392925262451, + "reward_std": 0.4354220926761627, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14530304074287415, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1066.7388916015625, + "completions/mean_terminated_length": 853.4212036132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5286878695860636, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15343738770341944, + "kl": 0.027252197265625, + "learning_rate": 5.839464668823552e-07, + "loss": 0.1211, + "num_tokens": 1400466829.0, + "reward": 2.36328125, + "reward_std": 0.49735790491104126, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.18215985596179962, + "step": 2481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1119.700927734375, + "completions/mean_terminated_length": 866.5284423828125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5289009642533695, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12748076230154207, + "kl": 0.0242919921875, + "learning_rate": 5.836126454310263e-07, + "loss": 0.0493, + "num_tokens": 1401039079.0, + "reward": 2.3989956378936768, + "reward_std": 0.4652915298938751, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 2482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 992.8906860351562, + "completions/mean_terminated_length": 780.7373046875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.5291140589206755, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 4.439852843399013, + "kl": 0.06658935546875, + "learning_rate": 5.832788053776708e-07, + "loss": 0.0883, + "num_tokens": 1401558470.0, + "reward": 2.3543527126312256, + "reward_std": 0.4658573269844055, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.8794642686843872, + "rewards/format_reward/std": 0.3259509205818176, + "rewards/tag_count_reward/mean": 0.9190848469734192, + "rewards/tag_count_reward/std": 0.24861730635166168, + "step": 2483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 992.8170166015625, + "completions/mean_terminated_length": 756.4097900390625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.5293271535879814, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1315104057846089, + "kl": 0.029083251953125, + "learning_rate": 5.829449469070441e-07, + "loss": 0.0998, + "num_tokens": 1402073764.0, + "reward": 2.3504464626312256, + "reward_std": 0.34892064332962036, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1576172560453415, + "step": 2484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 898.5469360351562, + "completions/mean_terminated_length": 737.6819458007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5295402482552874, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12191515970847414, + "kl": 0.029998779296875, + "learning_rate": 5.826110702039108e-07, + "loss": 0.0693, + "num_tokens": 1402543129.0, + "reward": 2.408482313156128, + "reward_std": 0.4478207528591156, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067808747291565, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14285963773727417, + "step": 2485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 967.122802734375, + "completions/mean_terminated_length": 790.251953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5297533429225934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14887562414092487, + "kl": 0.03009033203125, + "learning_rate": 5.822771754530463e-07, + "loss": 0.0744, + "num_tokens": 1403051040.0, + "reward": 2.4107143878936768, + "reward_std": 0.4643322825431824, + "rewards/accuracy_reward/mean": 0.5555555820465088, + "rewards/accuracy_reward/std": 0.4974800944328308, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14285963773727417, + "step": 2486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 980.7991333007812, + "completions/mean_terminated_length": 738.1205444335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5299664375898994, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1335019217940142, + "kl": 0.026947021484375, + "learning_rate": 5.819432628392358e-07, + "loss": 0.0998, + "num_tokens": 1403568678.0, + "reward": 2.3002233505249023, + "reward_std": 0.4801989197731018, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.18866156041622162, + "step": 2487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1063.82373046875, + "completions/mean_terminated_length": 833.369140625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5301795322572053, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1178279770259059, + "kl": 0.023895263671875, + "learning_rate": 5.816093325472744e-07, + "loss": 0.0539, + "num_tokens": 1404112359.0, + "reward": 2.33984375, + "reward_std": 0.35095176100730896, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14921022951602936, + "step": 2488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 886.732177734375, + "completions/mean_terminated_length": 734.242431640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5303926269245112, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14542665018048415, + "kl": 0.030853271484375, + "learning_rate": 5.81275384761967e-07, + "loss": 0.0727, + "num_tokens": 1404570863.0, + "reward": 2.4966518878936768, + "reward_std": 0.3693641126155853, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14505796134471893, + "step": 2489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 972.1295166015625, + "completions/mean_terminated_length": 799.3212280273438, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5306057215918172, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11472671389309784, + "kl": 0.027313232421875, + "learning_rate": 5.809414196681281e-07, + "loss": 0.0085, + "num_tokens": 1405081513.0, + "reward": 2.439174175262451, + "reward_std": 0.32061702013015747, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.07479990273714066, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 953.2031860351562, + "completions/mean_terminated_length": 777.3549194335938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5308188162591231, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12740261748511683, + "kl": 0.027496337890625, + "learning_rate": 5.806074374505815e-07, + "loss": 0.0398, + "num_tokens": 1405574644.0, + "reward": 2.51171875, + "reward_std": 0.4446275234222412, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12265980988740921, + "step": 2491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 986.2656860351562, + "completions/mean_terminated_length": 762.4405517578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5310319109264291, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1367710784006684, + "kl": 0.028106689453125, + "learning_rate": 5.802734382941612e-07, + "loss": 0.1059, + "num_tokens": 1406078715.0, + "reward": 2.3872768878936768, + "reward_std": 0.4167911410331726, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.1822258085012436, + "step": 2492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 874.9866333007812, + "completions/mean_terminated_length": 693.5927734375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.531245005593735, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13320480664421974, + "kl": 0.0308837890625, + "learning_rate": 5.799394223837101e-07, + "loss": 0.0668, + "num_tokens": 1406537397.0, + "reward": 2.5357143878936768, + "reward_std": 0.3654972016811371, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11652304977178574, + "step": 2493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1083.453125, + "completions/mean_terminated_length": 777.0676879882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.531458100261041, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11942950610744035, + "kl": 0.0260009765625, + "learning_rate": 5.796053899040804e-07, + "loss": 0.0575, + "num_tokens": 1407086608.0, + "reward": 2.385044813156128, + "reward_std": 0.41878095269203186, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.146973118185997, + "step": 2494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 985.3170166015625, + "completions/mean_terminated_length": 768.2096557617188, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5316711949283469, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12442838341438026, + "kl": 0.027313232421875, + "learning_rate": 5.792713410401335e-07, + "loss": 0.0644, + "num_tokens": 1407601294.0, + "reward": 2.353236675262451, + "reward_std": 0.4003566801548004, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13748814165592194, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 932.1897583007812, + "completions/mean_terminated_length": 722.0504150390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5318842895956528, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15871092263141104, + "kl": 0.031158447265625, + "learning_rate": 5.789372759767398e-07, + "loss": 0.0622, + "num_tokens": 1408088611.0, + "reward": 2.4190850257873535, + "reward_std": 0.4261649250984192, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992804229259491, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.150824636220932, + "step": 2496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1032.6295166015625, + "completions/mean_terminated_length": 811.896728515625, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.5320973842629588, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13023573129032154, + "kl": 0.02508544921875, + "learning_rate": 5.786031948987787e-07, + "loss": 0.0744, + "num_tokens": 1408622525.0, + "reward": 2.415736675262451, + "reward_std": 0.46487846970558167, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15423759818077087, + "step": 2497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 937.4688110351562, + "completions/mean_terminated_length": 748.9973754882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5323104789302647, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13263306961502633, + "kl": 0.029449462890625, + "learning_rate": 5.782690979911387e-07, + "loss": 0.0818, + "num_tokens": 1409114831.0, + "reward": 2.4369421005249023, + "reward_std": 0.4531923234462738, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16174425184726715, + "step": 2498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 929.8951416015625, + "completions/mean_terminated_length": 779.8709106445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5325235735975707, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.19656560004697857, + "kl": 0.03192138671875, + "learning_rate": 5.779349854387169e-07, + "loss": 0.0683, + "num_tokens": 1409607824.0, + "reward": 2.5558037757873535, + "reward_std": 0.4403499364852905, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.4810029864311218, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15902084112167358, + "step": 2499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 910.1920166015625, + "completions/mean_terminated_length": 767.251220703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5327366682648766, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12330147726368776, + "kl": 0.030487060546875, + "learning_rate": 5.776008574264189e-07, + "loss": 0.0826, + "num_tokens": 1410089926.0, + "reward": 2.4854912757873535, + "reward_std": 0.3864891231060028, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.10307835787534714, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 965.4420166015625, + "completions/mean_terminated_length": 794.8062133789062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5329497629321827, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13715560181129868, + "kl": 0.02630615234375, + "learning_rate": 5.772667141391589e-07, + "loss": 0.0905, + "num_tokens": 1410587724.0, + "reward": 2.3247768878936768, + "reward_std": 0.40243247151374817, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15571676194667816, + "step": 2501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 951.8839721679688, + "completions/mean_terminated_length": 779.1111450195312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5331628575994886, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1389484203241223, + "kl": 0.028961181640625, + "learning_rate": 5.769325557618595e-07, + "loss": 0.1085, + "num_tokens": 1411075192.0, + "reward": 2.4693081378936768, + "reward_std": 0.4963549077510834, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15941192209720612, + "step": 2502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 985.5000610351562, + "completions/mean_terminated_length": 743.8904418945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5333759522667946, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.38685527033596045, + "kl": 0.0513916015625, + "learning_rate": 5.765983824794523e-07, + "loss": 0.0954, + "num_tokens": 1411587432.0, + "reward": 2.3013393878936768, + "reward_std": 0.38494160771369934, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11967316269874573, + "step": 2503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 895.2745971679688, + "completions/mean_terminated_length": 717.0180053710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5335890469341005, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.16634039395593334, + "kl": 0.032623291015625, + "learning_rate": 5.762641944768763e-07, + "loss": 0.1091, + "num_tokens": 1412054099.0, + "reward": 2.447544813156128, + "reward_std": 0.5318894386291504, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.16929873824119568, + "step": 2504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 946.6183471679688, + "completions/mean_terminated_length": 773.0155029296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5338021416014064, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1309891704087857, + "kl": 0.028106689453125, + "learning_rate": 5.759299919390788e-07, + "loss": 0.0845, + "num_tokens": 1412547960.0, + "reward": 2.334263563156128, + "reward_std": 0.41789138317108154, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13359208405017853, + "step": 2505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1089.1429443359375, + "completions/mean_terminated_length": 810.0518188476562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5340152362687124, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13884567732794723, + "kl": 0.023712158203125, + "learning_rate": 5.755957750510157e-07, + "loss": 0.0687, + "num_tokens": 1413111000.0, + "reward": 2.3666296005249023, + "reward_std": 0.38791385293006897, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.12189408391714096, + "step": 2506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1011.9285888671875, + "completions/mean_terminated_length": 793.5135498046875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5342283309360183, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1213774906017427, + "kl": 0.027923583984375, + "learning_rate": 5.752615439976504e-07, + "loss": 0.0621, + "num_tokens": 1413633480.0, + "reward": 2.3565850257873535, + "reward_std": 0.4395703077316284, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13669590651988983, + "step": 2507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1020.747802734375, + "completions/mean_terminated_length": 820.7760009765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5344414256033243, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13448910255368723, + "kl": 0.028778076171875, + "learning_rate": 5.749272989639539e-07, + "loss": 0.0558, + "num_tokens": 1414154583.0, + "reward": 2.3643975257873535, + "reward_std": 0.43220189213752747, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.1611565798521042, + "step": 2508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1006.66748046875, + "completions/mean_terminated_length": 748.509765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5346545202706302, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1304327197273908, + "kl": 0.027557373046875, + "learning_rate": 5.745930401349054e-07, + "loss": 0.1002, + "num_tokens": 1414676130.0, + "reward": 2.35546875, + "reward_std": 0.48487091064453125, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.1728263646364212, + "step": 2509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 907.82373046875, + "completions/mean_terminated_length": 758.103515625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.5348676149379362, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12486998668341937, + "kl": 0.030303955078125, + "learning_rate": 5.742587676954919e-07, + "loss": 0.0173, + "num_tokens": 1415152723.0, + "reward": 2.4190850257873535, + "reward_std": 0.3946661353111267, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.1061379685997963, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1021.2098388671875, + "completions/mean_terminated_length": 794.5885620117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5350807096052421, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1453946247051401, + "kl": 0.024383544921875, + "learning_rate": 5.739244818307069e-07, + "loss": 0.0792, + "num_tokens": 1415680065.0, + "reward": 2.439732313156128, + "reward_std": 0.5073918700218201, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16487935185432434, + "step": 2511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 991.10498046875, + "completions/mean_terminated_length": 801.9763793945312, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5352938042725481, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.3133943003002054, + "kl": 0.02825927734375, + "learning_rate": 5.735901827255529e-07, + "loss": 0.0875, + "num_tokens": 1416199744.0, + "reward": 2.3738839626312256, + "reward_std": 0.4831084609031677, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.1648755669593811, + "step": 2512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 970.3370971679688, + "completions/mean_terminated_length": 760.552001953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.535506898939854, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12839339265974306, + "kl": 0.02838134765625, + "learning_rate": 5.732558705650383e-07, + "loss": 0.0244, + "num_tokens": 1416706727.0, + "reward": 2.4525671005249023, + "reward_std": 0.4027446508407593, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1258348822593689, + "step": 2513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1109.993408203125, + "completions/mean_terminated_length": 836.9711303710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5357199936071599, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13708669443582794, + "kl": 0.027130126953125, + "learning_rate": 5.729215455341794e-07, + "loss": 0.1028, + "num_tokens": 1417279876.0, + "reward": 2.408482313156128, + "reward_std": 0.5162177681922913, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.1378791779279709, + "step": 2514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 872.9732666015625, + "completions/mean_terminated_length": 731.969970703125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.535933088274466, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.21082023499587169, + "kl": 0.03302001953125, + "learning_rate": 5.725872078179995e-07, + "loss": 0.1104, + "num_tokens": 1417738520.0, + "reward": 2.5306921005249023, + "reward_std": 0.39250054955482483, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14566238224506378, + "step": 2515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 917.732177734375, + "completions/mean_terminated_length": 749.6410522460938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5361461829417719, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.20510272831215312, + "kl": 0.037139892578125, + "learning_rate": 5.722528576015291e-07, + "loss": 0.1151, + "num_tokens": 1418221248.0, + "reward": 2.454799175262451, + "reward_std": 0.4593575596809387, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12870889902114868, + "step": 2516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 938.1183471679688, + "completions/mean_terminated_length": 749.7572021484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5363592776090779, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1375264190704001, + "kl": 0.0279541015625, + "learning_rate": 5.719184950698053e-07, + "loss": 0.0741, + "num_tokens": 1418710645.0, + "reward": 2.4849331378936768, + "reward_std": 0.38200175762176514, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493200302124, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11638233810663223, + "step": 2517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1036.01123046875, + "completions/mean_terminated_length": 778.05322265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5365723722763838, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12382754785966851, + "kl": 0.027618408203125, + "learning_rate": 5.71584120407872e-07, + "loss": 0.0846, + "num_tokens": 1419237770.0, + "reward": 2.3738839626312256, + "reward_std": 0.439721941947937, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1573437601327896, + "step": 2518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 854.5982666015625, + "completions/mean_terminated_length": 694.4708862304688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5367854669436898, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1404310411286148, + "kl": 0.03167724609375, + "learning_rate": 5.712497338007803e-07, + "loss": 0.0298, + "num_tokens": 1419682262.0, + "reward": 2.3074777126312256, + "reward_std": 0.3961949050426483, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.8727678656578064, + "rewards/format_reward/std": 0.3336053788661957, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.18908046185970306, + "step": 2519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1084.399658203125, + "completions/mean_terminated_length": 858.7631225585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5369985616109957, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12986210209506627, + "kl": 0.02398681640625, + "learning_rate": 5.709153354335875e-07, + "loss": 0.0795, + "num_tokens": 1420239897.0, + "reward": 2.404017925262451, + "reward_std": 0.47819194197654724, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14433756470680237, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1043.734375, + "completions/mean_terminated_length": 794.7659912109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5372116562783016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14698619176495026, + "kl": 0.026458740234375, + "learning_rate": 5.705809254913576e-07, + "loss": 0.0623, + "num_tokens": 1420780786.0, + "reward": 2.466517925262451, + "reward_std": 0.4764553904533386, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1373893767595291, + "step": 2521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 976.9910888671875, + "completions/mean_terminated_length": 715.1889038085938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5374247509456076, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12205023232259284, + "kl": 0.03057861328125, + "learning_rate": 5.702465041591605e-07, + "loss": 0.0599, + "num_tokens": 1421285022.0, + "reward": 2.4676339626312256, + "reward_std": 0.4549930691719055, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15208269655704498, + "step": 2522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 971.6964721679688, + "completions/mean_terminated_length": 734.1471557617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5376378456129135, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13106855167259127, + "kl": 0.02789306640625, + "learning_rate": 5.699120716220734e-07, + "loss": 0.0645, + "num_tokens": 1421784230.0, + "reward": 2.5089287757873535, + "reward_std": 0.38742080330848694, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12672585248947144, + "step": 2523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 955.2567138671875, + "completions/mean_terminated_length": 759.7131958007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5378509402802195, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13571287968399678, + "kl": 0.028289794921875, + "learning_rate": 5.695776280651785e-07, + "loss": 0.0988, + "num_tokens": 1422275385.0, + "reward": 2.375, + "reward_std": 0.4402559697628021, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.11756829917430878, + "step": 2524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 956.732177734375, + "completions/mean_terminated_length": 771.530029296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5380640349475254, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1294028401861316, + "kl": 0.030426025390625, + "learning_rate": 5.692431736735653e-07, + "loss": 0.0852, + "num_tokens": 1422777185.0, + "reward": 2.4068081378936768, + "reward_std": 0.4592633545398712, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14827017486095428, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 966.5335083007812, + "completions/mean_terminated_length": 792.826416015625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.5382771296148314, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11906959217067009, + "kl": 0.02728271484375, + "learning_rate": 5.689087086323281e-07, + "loss": 0.0204, + "num_tokens": 1423279728.0, + "reward": 2.4732143878936768, + "reward_std": 0.4347270131111145, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.4852356016635895, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1540280133485794, + "step": 2526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1162.80810546875, + "completions/mean_terminated_length": 878.1887817382812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5384902242821373, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1141979224608553, + "kl": 0.023040771484375, + "learning_rate": 5.685742331265682e-07, + "loss": 0.0847, + "num_tokens": 1423868090.0, + "reward": 2.177455425262451, + "reward_std": 0.4327274262905121, + "rewards/accuracy_reward/mean": 0.2946428656578064, + "rewards/accuracy_reward/std": 0.45639169216156006, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16713166236877441, + "step": 2527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1052.6785888671875, + "completions/mean_terminated_length": 822.989013671875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.5387033189494433, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11984699791988482, + "kl": 0.026153564453125, + "learning_rate": 5.682397473413918e-07, + "loss": 0.0922, + "num_tokens": 1424405194.0, + "reward": 2.36328125, + "reward_std": 0.4711918234825134, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15252019464969635, + "step": 2528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1029.310302734375, + "completions/mean_terminated_length": 804.476806640625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.5389164136167492, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12966459816387496, + "kl": 0.0260009765625, + "learning_rate": 5.679052514619116e-07, + "loss": 0.078, + "num_tokens": 1424940149.0, + "reward": 2.303013563156128, + "reward_std": 0.44794052839279175, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.175029918551445, + "step": 2529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 956.810302734375, + "completions/mean_terminated_length": 778.251953125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.5391295082840551, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12046376271080182, + "kl": 0.027435302734375, + "learning_rate": 5.675707456732451e-07, + "loss": 0.0731, + "num_tokens": 1425436016.0, + "reward": 2.4933037757873535, + "reward_std": 0.37164655327796936, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12383606284856796, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 967.0625610351562, + "completions/mean_terminated_length": 735.6422729492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5393426029513612, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13157964400801345, + "kl": 0.02764892578125, + "learning_rate": 5.672362301605159e-07, + "loss": 0.0574, + "num_tokens": 1425939740.0, + "reward": 2.446986675262451, + "reward_std": 0.41433101892471313, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13748814165592194, + "step": 2531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1089.984375, + "completions/mean_terminated_length": 807.5635375976562, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.5395556976186671, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13316268841935947, + "kl": 0.025665283203125, + "learning_rate": 5.669017051088526e-07, + "loss": 0.0839, + "num_tokens": 1426494133.0, + "reward": 2.314732313156128, + "reward_std": 0.40007105469703674, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.1572524905204773, + "step": 2532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 915.3438110351562, + "completions/mean_terminated_length": 753.5357055664062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5397687922859731, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12801515974999175, + "kl": 0.02911376953125, + "learning_rate": 5.665671707033892e-07, + "loss": 0.0738, + "num_tokens": 1426969071.0, + "reward": 2.431919813156128, + "reward_std": 0.4664011299610138, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14505796134471893, + "step": 2533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 930.5982666015625, + "completions/mean_terminated_length": 741.0548706054688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.539981886953279, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 1.3898405126493263, + "kl": 0.029632568359375, + "learning_rate": 5.662326271292649e-07, + "loss": 0.0772, + "num_tokens": 1427470523.0, + "reward": 2.3995537757873535, + "reward_std": 0.42330262064933777, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.2979368567466736, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14336557686328888, + "step": 2534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 922.7991333007812, + "completions/mean_terminated_length": 748.7989501953125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.540194981620585, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1219167225201827, + "kl": 0.02716064453125, + "learning_rate": 5.658980745716241e-07, + "loss": 0.0858, + "num_tokens": 1427951329.0, + "reward": 2.462611675262451, + "reward_std": 0.41727176308631897, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.14825333654880524, + "step": 2535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1041.90625, + "completions/mean_terminated_length": 778.3380126953125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5404080762878909, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11989181049298381, + "kl": 0.025115966796875, + "learning_rate": 5.655635132156159e-07, + "loss": 0.0759, + "num_tokens": 1428488407.0, + "reward": 2.3270089626312256, + "reward_std": 0.424572616815567, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1662929803133011, + "step": 2536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1071.59375, + "completions/mean_terminated_length": 856.0926513671875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.5406211709551968, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11491931818942336, + "kl": 0.025115966796875, + "learning_rate": 5.652289432463944e-07, + "loss": 0.0358, + "num_tokens": 1429041425.0, + "reward": 2.3705358505249023, + "reward_std": 0.4363420605659485, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13888955116271973, + "step": 2537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1027.915283203125, + "completions/mean_terminated_length": 782.0775146484375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.5408342656225028, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1373722455798989, + "kl": 0.028717041015625, + "learning_rate": 5.648943648491184e-07, + "loss": 0.0627, + "num_tokens": 1429569467.0, + "reward": 2.2578125, + "reward_std": 0.5009745359420776, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15822990238666534, + "step": 2538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1226.310302734375, + "completions/mean_terminated_length": 856.682861328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5410473602898087, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1193030656405214, + "kl": 0.024658203125, + "learning_rate": 5.645597782089517e-07, + "loss": 0.1012, + "num_tokens": 1430190854.0, + "reward": 2.2979912757873535, + "reward_std": 0.4786623418331146, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.16274166107177734, + "step": 2539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1093.63623046875, + "completions/mean_terminated_length": 863.6370849609375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5412604549571147, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10841522890703569, + "kl": 0.023223876953125, + "learning_rate": 5.642251835110621e-07, + "loss": 0.0728, + "num_tokens": 1430754931.0, + "reward": 2.3314733505249023, + "reward_std": 0.38782799243927, + "rewards/accuracy_reward/mean": 0.4236111044883728, + "rewards/accuracy_reward/std": 0.49470317363739014, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266100406646729, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 936.33935546875, + "completions/mean_terminated_length": 774.2813110351562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5414735496244206, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13340307722619693, + "kl": 0.029937744140625, + "learning_rate": 5.638905809406222e-07, + "loss": 0.0992, + "num_tokens": 1431241675.0, + "reward": 2.377232313156128, + "reward_std": 0.4577689468860626, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13058780133724213, + "step": 2541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1007.7589721679688, + "completions/mean_terminated_length": 815.1216430664062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5416866442917266, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12328841874383709, + "kl": 0.025299072265625, + "learning_rate": 5.635559706828093e-07, + "loss": 0.0867, + "num_tokens": 1431756543.0, + "reward": 2.3599331378936768, + "reward_std": 0.4778337776660919, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15551920235157013, + "step": 2542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 901.0513916015625, + "completions/mean_terminated_length": 727.092529296875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5418997389590325, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1473653153540454, + "kl": 0.03179931640625, + "learning_rate": 5.632213529228038e-07, + "loss": 0.1122, + "num_tokens": 1432219062.0, + "reward": 2.564732313156128, + "reward_std": 0.5007344484329224, + "rewards/accuracy_reward/mean": 0.6919642686843872, + "rewards/accuracy_reward/std": 0.462197482585907, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13940991461277008, + "step": 2543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 971.3750610351562, + "completions/mean_terminated_length": 754.8954467773438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5421128336263386, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13695829674396554, + "kl": 0.027191162109375, + "learning_rate": 5.628867278457918e-07, + "loss": 0.0469, + "num_tokens": 1432727310.0, + "reward": 2.35546875, + "reward_std": 0.4119955599308014, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14590215682983398, + "step": 2544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 974.8995971679688, + "completions/mean_terminated_length": 748.6784057617188, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.5423259282936445, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1262898319965193, + "kl": 0.0294189453125, + "learning_rate": 5.625520956369622e-07, + "loss": 0.0746, + "num_tokens": 1433233777.0, + "reward": 2.439732313156128, + "reward_std": 0.35909226536750793, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12083587795495987, + "step": 2545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1007.185302734375, + "completions/mean_terminated_length": 773.9972534179688, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.5425390229609504, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13804189209290182, + "kl": 0.028045654296875, + "learning_rate": 5.622174564815085e-07, + "loss": 0.0842, + "num_tokens": 1433755668.0, + "reward": 2.4090402126312256, + "reward_std": 0.39986953139305115, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15675851702690125, + "step": 2546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1051.3929443359375, + "completions/mean_terminated_length": 811.2132568359375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.5427521176282564, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1355460551757965, + "kl": 0.024139404296875, + "learning_rate": 5.618828105646277e-07, + "loss": 0.0769, + "num_tokens": 1434296500.0, + "reward": 2.3777902126312256, + "reward_std": 0.4623502492904663, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12956929206848145, + "step": 2547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 961.3839721679688, + "completions/mean_terminated_length": 749.8560180664062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5429652122955623, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 2150.53406387813, + "kl": 105.60092163085938, + "learning_rate": 5.615481580715209e-07, + "loss": 4.3181, + "num_tokens": 1434796752.0, + "reward": 2.3744421005249023, + "reward_std": 0.42382705211639404, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15122967958450317, + "step": 2548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 886.1004638671875, + "completions/mean_terminated_length": 706.4252319335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5431783069628683, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14041413351707122, + "kl": 0.029632568359375, + "learning_rate": 5.612134991873925e-07, + "loss": 0.0755, + "num_tokens": 1435261293.0, + "reward": 2.5033483505249023, + "reward_std": 0.4114845395088196, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.11055814474821091, + "step": 2549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1136.0982666015625, + "completions/mean_terminated_length": 842.890869140625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5433914016301742, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11273338321191699, + "kl": 0.0240478515625, + "learning_rate": 5.608788340974506e-07, + "loss": 0.073, + "num_tokens": 1435843113.0, + "reward": 2.25390625, + "reward_std": 0.468479722738266, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.1660102754831314, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1001.5692138671875, + "completions/mean_terminated_length": 780.9702758789062, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5436044962974802, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1375104936446683, + "kl": 0.029449462890625, + "learning_rate": 5.605441629869066e-07, + "loss": 0.1008, + "num_tokens": 1436357800.0, + "reward": 2.4464287757873535, + "reward_std": 0.4158378541469574, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13833114504814148, + "step": 2551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1074.529052734375, + "completions/mean_terminated_length": 829.8016357421875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5438175909647861, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12140846818449216, + "kl": 0.0244140625, + "learning_rate": 5.602094860409758e-07, + "loss": 0.0958, + "num_tokens": 1436912869.0, + "reward": 2.3275671005249023, + "reward_std": 0.4429693818092346, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16174425184726715, + "step": 2552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 980.6607666015625, + "completions/mean_terminated_length": 799.5195922851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.544030685632092, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13010839400105428, + "kl": 0.028778076171875, + "learning_rate": 5.598748034448758e-07, + "loss": 0.0877, + "num_tokens": 1437414717.0, + "reward": 2.53515625, + "reward_std": 0.46613243222236633, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14126798510551453, + "step": 2553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1122.1138916015625, + "completions/mean_terminated_length": 849.1647338867188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.544243780299398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14159495696811322, + "kl": 0.023284912109375, + "learning_rate": 5.595401153838279e-07, + "loss": 0.0718, + "num_tokens": 1437994736.0, + "reward": 2.2963171005249023, + "reward_std": 0.4919103682041168, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14469929039478302, + "step": 2554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1055.493408203125, + "completions/mean_terminated_length": 884.0131225585938, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5444568749667039, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13486247681306315, + "kl": 0.027191162109375, + "learning_rate": 5.592054220430563e-07, + "loss": 0.0822, + "num_tokens": 1438539565.0, + "reward": 2.4029018878936768, + "reward_std": 0.5142381191253662, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1728886514902115, + "step": 2555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 975.7232666015625, + "completions/mean_terminated_length": 780.5066528320312, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.5446699696340099, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12594702108622743, + "kl": 0.028594970703125, + "learning_rate": 5.588707236077883e-07, + "loss": 0.0936, + "num_tokens": 1439046705.0, + "reward": 2.45703125, + "reward_std": 0.4303736090660095, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15488377213478088, + "step": 2556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 980.7589721679688, + "completions/mean_terminated_length": 719.8778076171875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.5448830643013158, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12738095522427592, + "kl": 0.02789306640625, + "learning_rate": 5.58536020263254e-07, + "loss": 0.0643, + "num_tokens": 1439554021.0, + "reward": 2.34375, + "reward_std": 0.46259820461273193, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.19045621156692505, + "step": 2557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1004.5960083007812, + "completions/mean_terminated_length": 749.5416870117188, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.5450961589686218, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13022701076071544, + "kl": 0.02978515625, + "learning_rate": 5.582013121946854e-07, + "loss": 0.1265, + "num_tokens": 1440083824.0, + "reward": 2.359375, + "reward_std": 0.5142177939414978, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.315234512090683, + "rewards/tag_count_reward/mean": 0.9397321343421936, + "rewards/tag_count_reward/std": 0.1906527429819107, + "step": 2558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 926.9777221679688, + "completions/mean_terminated_length": 773.3350219726562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5453092536359277, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12026737910838516, + "kl": 0.027313232421875, + "learning_rate": 5.578665995873186e-07, + "loss": 0.0705, + "num_tokens": 1440566870.0, + "reward": 2.4503350257873535, + "reward_std": 0.38047823309898376, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14517304301261902, + "step": 2559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1194.37060546875, + "completions/mean_terminated_length": 964.6401977539062, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.5455223483032338, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1238930668961791, + "kl": 0.024200439453125, + "learning_rate": 5.57531882626391e-07, + "loss": 0.0637, + "num_tokens": 1441176556.0, + "reward": 2.3080358505249023, + "reward_std": 0.5359495878219604, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.18524597585201263, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1019.6272583007812, + "completions/mean_terminated_length": 812.8499145507812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5457354429705397, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13529000791082935, + "kl": 0.028045654296875, + "learning_rate": 5.571971614971429e-07, + "loss": 0.1024, + "num_tokens": 1441698005.0, + "reward": 2.3973214626312256, + "reward_std": 0.4668022692203522, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14383503794670105, + "step": 2561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1010.3035888671875, + "completions/mean_terminated_length": 767.3168334960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5459485376378456, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11849813685526632, + "kl": 0.029266357421875, + "learning_rate": 5.568624363848166e-07, + "loss": 0.074, + "num_tokens": 1442223165.0, + "reward": 2.25390625, + "reward_std": 0.39514774084091187, + "rewards/accuracy_reward/mean": 0.3705357015132904, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 2562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1098.828125, + "completions/mean_terminated_length": 866.808349609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5461616323051516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12414305753363285, + "kl": 0.025146484375, + "learning_rate": 5.565277074746574e-07, + "loss": 0.0952, + "num_tokens": 1442782432.0, + "reward": 2.275669813156128, + "reward_std": 0.5509776473045349, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.1878126710653305, + "step": 2563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1068.5067138671875, + "completions/mean_terminated_length": 787.0430908203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5463747269724575, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1301431766941673, + "kl": 0.02874755859375, + "learning_rate": 5.561929749519114e-07, + "loss": 0.0767, + "num_tokens": 1443332947.0, + "reward": 2.40234375, + "reward_std": 0.44990086555480957, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14963631331920624, + "step": 2564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 856.0491333007812, + "completions/mean_terminated_length": 706.3065185546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5465878216397635, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1418762295400127, + "kl": 0.029876708984375, + "learning_rate": 5.558582390018282e-07, + "loss": 0.0778, + "num_tokens": 1443775513.0, + "reward": 2.5457589626312256, + "reward_std": 0.36205828189849854, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13083133101463318, + "step": 2565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 961.2857666015625, + "completions/mean_terminated_length": 789.994873046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5468009163070694, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13569106289844665, + "kl": 0.029510498046875, + "learning_rate": 5.555234998096576e-07, + "loss": 0.0727, + "num_tokens": 1444271929.0, + "reward": 2.5150671005249023, + "reward_std": 0.3758191764354706, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11660738289356232, + "step": 2566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1035.805908203125, + "completions/mean_terminated_length": 798.7906494140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5470140109743754, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1165296335371031, + "kl": 0.02587890625, + "learning_rate": 5.55188757560653e-07, + "loss": 0.0466, + "num_tokens": 1444808098.0, + "reward": 2.3839287757873535, + "reward_std": 0.3771636188030243, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 2567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1007.0558471679688, + "completions/mean_terminated_length": 784.1978759765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5472271056416813, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12479448805958689, + "kl": 0.0252685546875, + "learning_rate": 5.548540124400683e-07, + "loss": 0.0978, + "num_tokens": 1445326411.0, + "reward": 2.428013563156128, + "reward_std": 0.4119296967983246, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1208655834197998, + "step": 2568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1168.419677734375, + "completions/mean_terminated_length": 882.1657104492188, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5474402003089872, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12915338114329536, + "kl": 0.0235595703125, + "learning_rate": 5.54519264633159e-07, + "loss": 0.0897, + "num_tokens": 1445921511.0, + "reward": 2.189174175262451, + "reward_std": 0.44451647996902466, + "rewards/accuracy_reward/mean": 0.3147321343421936, + "rewards/accuracy_reward/std": 0.4649282991886139, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.17231273651123047, + "step": 2569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1067.40185546875, + "completions/mean_terminated_length": 879.6276245117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5476532949762932, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11660486962641035, + "kl": 0.02398681640625, + "learning_rate": 5.541845143251828e-07, + "loss": 0.0745, + "num_tokens": 1446472635.0, + "reward": 2.4112725257873535, + "reward_std": 0.43731269240379333, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.16288256645202637, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1103.21435546875, + "completions/mean_terminated_length": 838.6742553710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5478663896435991, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11840129100858982, + "kl": 0.023468017578125, + "learning_rate": 5.538497617013983e-07, + "loss": 0.0534, + "num_tokens": 1447054363.0, + "reward": 2.30859375, + "reward_std": 0.4556789696216583, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.16595762968063354, + "step": 2571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1004.6451416015625, + "completions/mean_terminated_length": 804.8536987304688, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.5480794843109051, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14144215124798631, + "kl": 0.027496337890625, + "learning_rate": 5.535150069470652e-07, + "loss": 0.0951, + "num_tokens": 1447574524.0, + "reward": 2.377232313156128, + "reward_std": 0.44333258271217346, + "rewards/accuracy_reward/mean": 0.5069444179534912, + "rewards/accuracy_reward/std": 0.5005314350128174, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14383503794670105, + "step": 2572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1036.263427734375, + "completions/mean_terminated_length": 832.8311157226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.548292578978211, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13532153314087433, + "kl": 0.026092529296875, + "learning_rate": 5.531802502474449e-07, + "loss": 0.0817, + "num_tokens": 1448107938.0, + "reward": 2.318080425262451, + "reward_std": 0.5077657699584961, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960493743419647, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15610112249851227, + "step": 2573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1060.1785888671875, + "completions/mean_terminated_length": 858.3656005859375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5485056736455171, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12089921734551706, + "kl": 0.024444580078125, + "learning_rate": 5.528454917877995e-07, + "loss": 0.0821, + "num_tokens": 1448655026.0, + "reward": 2.4793527126312256, + "reward_std": 0.5006406903266907, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14566238224506378, + "step": 2574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1070.607177734375, + "completions/mean_terminated_length": 811.0734252929688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.548718768312823, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13688520262825593, + "kl": 0.02423095703125, + "learning_rate": 5.525107317533922e-07, + "loss": 0.0883, + "num_tokens": 1449203522.0, + "reward": 2.40234375, + "reward_std": 0.449735164642334, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11317373067140579, + "step": 2575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 976.7120971679688, + "completions/mean_terminated_length": 781.6754760742188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.548931862980129, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12134551453067012, + "kl": 0.027252197265625, + "learning_rate": 5.521759703294871e-07, + "loss": 0.0299, + "num_tokens": 1449709745.0, + "reward": 2.4994421005249023, + "reward_std": 0.4723702371120453, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13463464379310608, + "step": 2576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1012.57373046875, + "completions/mean_terminated_length": 741.3211059570312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5491449576474349, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15234245257056114, + "kl": 0.027496337890625, + "learning_rate": 5.518412077013489e-07, + "loss": 0.0598, + "num_tokens": 1450238322.0, + "reward": 2.4564733505249023, + "reward_std": 0.42672479152679443, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15116052329540253, + "step": 2577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 911.7031860351562, + "completions/mean_terminated_length": 701.2777709960938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5493580523147408, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1445336646711588, + "kl": 0.02880859375, + "learning_rate": 5.515064440542433e-07, + "loss": 0.0767, + "num_tokens": 1450712221.0, + "reward": 2.4765625, + "reward_std": 0.40348440408706665, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1396559476852417, + "step": 2578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1015.3058471679688, + "completions/mean_terminated_length": 810.9759521484375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5495711469820468, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12032728502922249, + "kl": 0.027740478515625, + "learning_rate": 5.51171679573436e-07, + "loss": 0.0877, + "num_tokens": 1451244470.0, + "reward": 2.5379464626312256, + "reward_std": 0.42562028765678406, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.15048591792583466, + "step": 2579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1058.8616943359375, + "completions/mean_terminated_length": 827.2451782226562, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5497842416493527, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11493643529843638, + "kl": 0.026611328125, + "learning_rate": 5.508369144441939e-07, + "loss": 0.0684, + "num_tokens": 1451792216.0, + "reward": 2.424107313156128, + "reward_std": 0.4627886116504669, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13989263772964478, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 902.0313110351562, + "completions/mean_terminated_length": 738.3214111328125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.5499973363166587, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1381902211240128, + "kl": 0.031005859375, + "learning_rate": 5.505021488517836e-07, + "loss": 0.0766, + "num_tokens": 1452265366.0, + "reward": 2.5401787757873535, + "reward_std": 0.455746591091156, + "rewards/accuracy_reward/mean": 0.6629464030265808, + "rewards/accuracy_reward/std": 0.47323182225227356, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12286446243524551, + "step": 2581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 989.30810546875, + "completions/mean_terminated_length": 715.7135009765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5502104309839646, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12238713164235202, + "kl": 0.0269775390625, + "learning_rate": 5.501673829814725e-07, + "loss": 0.0695, + "num_tokens": 1452779120.0, + "reward": 2.3660714626312256, + "reward_std": 0.4281170070171356, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15813913941383362, + "step": 2582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 921.8928833007812, + "completions/mean_terminated_length": 754.4205322265625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5504235256512706, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14369608573527254, + "kl": 0.029876708984375, + "learning_rate": 5.498326170185274e-07, + "loss": 0.1103, + "num_tokens": 1453259616.0, + "reward": 2.424107313156128, + "reward_std": 0.4437844753265381, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.1168653815984726, + "step": 2583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 971.1964721679688, + "completions/mean_terminated_length": 781.8372802734375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5506366203185765, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12703118156672616, + "kl": 0.0264892578125, + "learning_rate": 5.494978511482165e-07, + "loss": 0.0723, + "num_tokens": 1453762264.0, + "reward": 2.529017925262451, + "reward_std": 0.41542086005210876, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12155692279338837, + "step": 2584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1008.6920166015625, + "completions/mean_terminated_length": 786.184326171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5508497149858825, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12812316691405934, + "kl": 0.025390625, + "learning_rate": 5.491630855558062e-07, + "loss": 0.1043, + "num_tokens": 1454290974.0, + "reward": 2.4029018878936768, + "reward_std": 0.4100467264652252, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13607001304626465, + "step": 2585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 856.7835083007812, + "completions/mean_terminated_length": 733.55419921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5510628096531884, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.20322389100892088, + "kl": 0.0384521484375, + "learning_rate": 5.488283204265642e-07, + "loss": 0.0712, + "num_tokens": 1454736029.0, + "reward": 2.4916296005249023, + "reward_std": 0.39276954531669617, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14151521027088165, + "step": 2586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 964.9888916015625, + "completions/mean_terminated_length": 747.2252197265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5512759043204943, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14209145193041803, + "kl": 0.03094482421875, + "learning_rate": 5.48493555945757e-07, + "loss": 0.0818, + "num_tokens": 1455236184.0, + "reward": 2.3895089626312256, + "reward_std": 0.4565965235233307, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12695714831352234, + "step": 2587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1114.9866943359375, + "completions/mean_terminated_length": 836.434814453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5514889989878003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1324779333317211, + "kl": 0.02508544921875, + "learning_rate": 5.481587922986511e-07, + "loss": 0.1283, + "num_tokens": 1455812034.0, + "reward": 2.3314733505249023, + "reward_std": 0.5204752087593079, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1599874496459961, + "step": 2588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 1023.2656860351562, + "completions/mean_terminated_length": 765.6508178710938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5517020936551063, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 1.2567054797592596, + "kl": 0.163177490234375, + "learning_rate": 5.47824029670513e-07, + "loss": 0.072, + "num_tokens": 1456345289.0, + "reward": 2.294642925262451, + "reward_std": 0.39642608165740967, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14285963773727417, + "step": 2589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 977.732177734375, + "completions/mean_terminated_length": 765.9679565429688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5519151883224123, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1.4967322351541157, + "kl": 0.08843994140625, + "learning_rate": 5.474892682466078e-07, + "loss": 0.0844, + "num_tokens": 1456861937.0, + "reward": 2.4481027126312256, + "reward_std": 0.45657193660736084, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16229133307933807, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 954.919677734375, + "completions/mean_terminated_length": 789.131103515625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.5521282829897182, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12856782879075673, + "kl": 0.027008056640625, + "learning_rate": 5.471545082122006e-07, + "loss": 0.0541, + "num_tokens": 1457358989.0, + "reward": 2.4598214626312256, + "reward_std": 0.4691341519355774, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.13043475151062012, + "step": 2591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 857.935302734375, + "completions/mean_terminated_length": 721.7586669921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5523413776570242, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13654891552826218, + "kl": 0.03204345703125, + "learning_rate": 5.468197497525552e-07, + "loss": 0.0511, + "num_tokens": 1457812544.0, + "reward": 2.5636162757873535, + "reward_std": 0.3479025661945343, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1312885582447052, + "step": 2592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 943.0647583007812, + "completions/mean_terminated_length": 788.4300537109375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5525544723243301, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14404815460757897, + "kl": 0.02978515625, + "learning_rate": 5.464849930529349e-07, + "loss": 0.1286, + "num_tokens": 1458303741.0, + "reward": 2.4916296005249023, + "reward_std": 0.4349367618560791, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13903217017650604, + "step": 2593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 974.0692138671875, + "completions/mean_terminated_length": 768.4228515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.552767566991636, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14537344691061183, + "kl": 0.028778076171875, + "learning_rate": 5.461502382986018e-07, + "loss": 0.0999, + "num_tokens": 1458804380.0, + "reward": 2.4185268878936768, + "reward_std": 0.4646940529346466, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15339045226573944, + "step": 2594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1093.9888916015625, + "completions/mean_terminated_length": 854.1536254882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.552980661658942, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14030659575507484, + "kl": 0.024139404296875, + "learning_rate": 5.458154856748172e-07, + "loss": 0.0673, + "num_tokens": 1459369063.0, + "reward": 2.310826063156128, + "reward_std": 0.530843198299408, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16573180258274078, + "step": 2595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 948.794677734375, + "completions/mean_terminated_length": 768.9246826171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5531937563262479, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13749057975563722, + "kl": 0.02874755859375, + "learning_rate": 5.454807353668411e-07, + "loss": 0.0599, + "num_tokens": 1459862555.0, + "reward": 2.4659600257873535, + "reward_std": 0.38980361819267273, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12715734541416168, + "step": 2596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 868.6361694335938, + "completions/mean_terminated_length": 693.2435913085938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5534068509935539, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1332445726900636, + "kl": 0.033447265625, + "learning_rate": 5.451459875599317e-07, + "loss": 0.0849, + "num_tokens": 1460318840.0, + "reward": 2.541294813156128, + "reward_std": 0.43197789788246155, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11709482222795486, + "step": 2597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 896.2813110351562, + "completions/mean_terminated_length": 758.0750122070312, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5536199456608598, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1343735938852152, + "kl": 0.0291748046875, + "learning_rate": 5.44811242439347e-07, + "loss": 0.0724, + "num_tokens": 1460785622.0, + "reward": 2.4296875, + "reward_std": 0.42164888978004456, + "rewards/accuracy_reward/mean": 0.5509259104728699, + "rewards/accuracy_reward/std": 0.49797651171684265, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12585102021694183, + "step": 2598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 990.8281860351562, + "completions/mean_terminated_length": 778.2600708007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5538330403281658, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13371481214003675, + "kl": 0.026824951171875, + "learning_rate": 5.444765001903424e-07, + "loss": 0.1133, + "num_tokens": 1461305305.0, + "reward": 2.3677456378936768, + "reward_std": 0.43860164284706116, + "rewards/accuracy_reward/mean": 0.5046296119689941, + "rewards/accuracy_reward/std": 0.5005582571029663, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14989471435546875, + "step": 2599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 963.6138916015625, + "completions/mean_terminated_length": 792.68994140625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.5540461349954717, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13496517540196407, + "kl": 0.0272216796875, + "learning_rate": 5.44141760998172e-07, + "loss": 0.0771, + "num_tokens": 1461803852.0, + "reward": 2.427455425262451, + "reward_std": 0.37594443559646606, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12177752703428268, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 969.46435546875, + "completions/mean_terminated_length": 756.064208984375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.5542592296627777, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12995735936192493, + "kl": 0.027374267578125, + "learning_rate": 5.438070250480887e-07, + "loss": 0.0668, + "num_tokens": 1462303708.0, + "reward": 2.4771206378936768, + "reward_std": 0.4451123774051666, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.152151420712471, + "step": 2601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 962.2366333007812, + "completions/mean_terminated_length": 781.2760620117188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5544723243300836, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11735627642728928, + "kl": 0.027984619140625, + "learning_rate": 5.434722925253427e-07, + "loss": 0.0621, + "num_tokens": 1462804486.0, + "reward": 2.4447546005249023, + "reward_std": 0.4166436493396759, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12783296406269073, + "step": 2602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 997.888427734375, + "completions/mean_terminated_length": 762.6174926757812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5546854189973895, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13765768979067447, + "kl": 0.027313232421875, + "learning_rate": 5.431375636151833e-07, + "loss": 0.1218, + "num_tokens": 1463319332.0, + "reward": 2.3426339626312256, + "reward_std": 0.5067855715751648, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16460275650024414, + "step": 2603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1075.1138916015625, + "completions/mean_terminated_length": 847.3030395507812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5548985136646956, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12972409661348047, + "kl": 0.025390625, + "learning_rate": 5.428028385028572e-07, + "loss": 0.0592, + "num_tokens": 1463871143.0, + "reward": 2.2767858505249023, + "reward_std": 0.4311223328113556, + "rewards/accuracy_reward/mean": 0.3705357015132904, + "rewards/accuracy_reward/std": 0.4834881126880646, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.1378791630268097, + "step": 2604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1051.0357666015625, + "completions/mean_terminated_length": 817.5867919921875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5551116083320015, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12029987376112536, + "kl": 0.02606201171875, + "learning_rate": 5.42468117373609e-07, + "loss": 0.0492, + "num_tokens": 1464405447.0, + "reward": 2.4760046005249023, + "reward_std": 0.4058417081832886, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.1046096533536911, + "step": 2605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 874.2410888671875, + "completions/mean_terminated_length": 720.111083984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5553247029993075, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.18953896596032319, + "kl": 0.03857421875, + "learning_rate": 5.421334004126814e-07, + "loss": 0.0674, + "num_tokens": 1464873171.0, + "reward": 2.4324777126312256, + "reward_std": 0.4040633738040924, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12015077471733093, + "step": 2606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 857.2098388671875, + "completions/mean_terminated_length": 676.6015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5555377976666134, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14002389938369492, + "kl": 0.0338134765625, + "learning_rate": 5.417986878053145e-07, + "loss": 0.072, + "num_tokens": 1465323761.0, + "reward": 2.4732143878936768, + "reward_std": 0.4370638132095337, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14285963773727417, + "step": 2607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 906.6629638671875, + "completions/mean_terminated_length": 746.933837890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5557508923339194, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13729975978229228, + "kl": 0.033843994140625, + "learning_rate": 5.414639797367462e-07, + "loss": 0.0774, + "num_tokens": 1465799994.0, + "reward": 2.46484375, + "reward_std": 0.38406994938850403, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1273927539587021, + "step": 2608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1026.9085693359375, + "completions/mean_terminated_length": 741.0028686523438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5559639870012253, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12521871723959077, + "kl": 0.026275634765625, + "learning_rate": 5.411292763922115e-07, + "loss": 0.015, + "num_tokens": 1466340705.0, + "reward": 2.3878350257873535, + "reward_std": 0.35273706912994385, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11214316636323929, + "step": 2609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 883.1763916015625, + "completions/mean_terminated_length": 749.8880615234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5561770816685312, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13786148696566777, + "kl": 0.030609130859375, + "learning_rate": 5.407945779569437e-07, + "loss": 0.0956, + "num_tokens": 1466805696.0, + "reward": 2.4034600257873535, + "reward_std": 0.36062392592430115, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1070.52685546875, + "completions/mean_terminated_length": 803.9431762695312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5563901763358372, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13069413153063825, + "kl": 0.025604248046875, + "learning_rate": 5.404598846161722e-07, + "loss": 0.0844, + "num_tokens": 1467355820.0, + "reward": 2.2885046005249023, + "reward_std": 0.3684141933917999, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11517468094825745, + "step": 2611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1013.3660888671875, + "completions/mean_terminated_length": 781.5628051757812, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.5566032710031431, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15154086931701127, + "kl": 0.028900146484375, + "learning_rate": 5.401251965551243e-07, + "loss": 0.1087, + "num_tokens": 1467886624.0, + "reward": 2.3214287757873535, + "reward_std": 0.5204406976699829, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.8705357313156128, + "rewards/format_reward/std": 0.3360883891582489, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.18038389086723328, + "step": 2612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 886.4285888671875, + "completions/mean_terminated_length": 717.0946044921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5568163656704491, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1384663688452525, + "kl": 0.033721923828125, + "learning_rate": 5.397905139590243e-07, + "loss": 0.0376, + "num_tokens": 1468353136.0, + "reward": 2.4966518878936768, + "reward_std": 0.46916860342025757, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1396559476852417, + "step": 2613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 995.7723388671875, + "completions/mean_terminated_length": 790.9386596679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.557029460337755, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12180172590545481, + "kl": 0.02587890625, + "learning_rate": 5.394558370130933e-07, + "loss": 0.0596, + "num_tokens": 1468871386.0, + "reward": 2.3286831378936768, + "reward_std": 0.3538344204425812, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174957811832428, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11561822891235352, + "step": 2614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 993.8058471679688, + "completions/mean_terminated_length": 768.1111450195312, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.557242555005061, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12623495953964037, + "kl": 0.029510498046875, + "learning_rate": 5.391211659025495e-07, + "loss": 0.0694, + "num_tokens": 1469386195.0, + "reward": 2.4207589626312256, + "reward_std": 0.42882055044174194, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12805373966693878, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 846.0982666015625, + "completions/mean_terminated_length": 681.3705444335938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5574556496723669, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14415830208165778, + "kl": 0.03057861328125, + "learning_rate": 5.387865008126077e-07, + "loss": 0.0034, + "num_tokens": 1469828367.0, + "reward": 2.43359375, + "reward_std": 0.4682982563972473, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14052370190620422, + "step": 2616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1030.9598388671875, + "completions/mean_terminated_length": 799.6876831054688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.557668744339673, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12576256610199696, + "kl": 0.029083251953125, + "learning_rate": 5.384518419284791e-07, + "loss": 0.0854, + "num_tokens": 1470358717.0, + "reward": 2.3588171005249023, + "reward_std": 0.4639336168766022, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16087746620178223, + "step": 2617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 922.872802734375, + "completions/mean_terminated_length": 787.8574829101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5578818390069789, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15170371406206454, + "kl": 0.028228759765625, + "learning_rate": 5.381171894353725e-07, + "loss": 0.0855, + "num_tokens": 1470842164.0, + "reward": 2.3738839626312256, + "reward_std": 0.423240065574646, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14311718940734863, + "step": 2618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 987.9531860351562, + "completions/mean_terminated_length": 791.6481323242188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5580949336742848, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13135331044535922, + "kl": 0.029388427734375, + "learning_rate": 5.377825435184915e-07, + "loss": 0.094, + "num_tokens": 1471353263.0, + "reward": 2.46484375, + "reward_std": 0.4773964285850525, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16172881424427032, + "step": 2619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 963.669677734375, + "completions/mean_terminated_length": 779.6448974609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5583080283415908, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12757267422164323, + "kl": 0.02850341796875, + "learning_rate": 5.37447904363038e-07, + "loss": 0.085, + "num_tokens": 1471850395.0, + "reward": 2.349888563156128, + "reward_std": 0.4307153820991516, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15628795325756073, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 951.2031860351562, + "completions/mean_terminated_length": 761.7042236328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5585211230088967, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12403225478166431, + "kl": 0.027679443359375, + "learning_rate": 5.371132721542084e-07, + "loss": 0.0562, + "num_tokens": 1472343990.0, + "reward": 2.4441964626312256, + "reward_std": 0.41627421975135803, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13376134634017944, + "step": 2621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1004.5535888671875, + "completions/mean_terminated_length": 774.256103515625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5587342176762027, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14373088519809513, + "kl": 0.027008056640625, + "learning_rate": 5.367786470771962e-07, + "loss": 0.0864, + "num_tokens": 1472863694.0, + "reward": 2.3370537757873535, + "reward_std": 0.4103930592536926, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14336557686328888, + "step": 2622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 984.7388916015625, + "completions/mean_terminated_length": 797.7611694335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5589473123435086, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11361073484305183, + "kl": 0.027099609375, + "learning_rate": 5.364440293171909e-07, + "loss": 0.0689, + "num_tokens": 1473373945.0, + "reward": 2.4799108505249023, + "reward_std": 0.399619460105896, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11244472116231918, + "step": 2623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 981.6138916015625, + "completions/mean_terminated_length": 835.4593505859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5591604070108146, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12396197335137026, + "kl": 0.027862548828125, + "learning_rate": 5.361094190593777e-07, + "loss": 0.0614, + "num_tokens": 1473882316.0, + "reward": 2.459263563156128, + "reward_std": 0.47173482179641724, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12605296075344086, + "step": 2624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 968.3370971679688, + "completions/mean_terminated_length": 758.1626586914062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5593735016781205, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12625989178642208, + "kl": 0.028839111328125, + "learning_rate": 5.35774816488938e-07, + "loss": 0.0839, + "num_tokens": 1474378611.0, + "reward": 2.4441964626312256, + "reward_std": 0.4054144322872162, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13731664419174194, + "step": 2625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 979.9732666015625, + "completions/mean_terminated_length": 718.9000244140625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.5595865963454264, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12840186531315384, + "kl": 0.027801513671875, + "learning_rate": 5.354402217910483e-07, + "loss": 0.0974, + "num_tokens": 1474887527.0, + "reward": 2.4419643878936768, + "reward_std": 0.40829336643218994, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11244472116231918, + "step": 2626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 939.3973388671875, + "completions/mean_terminated_length": 767.9638671875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.5597996910127324, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.6444096761549296, + "kl": 0.031646728515625, + "learning_rate": 5.351056351508816e-07, + "loss": 0.0831, + "num_tokens": 1475382601.0, + "reward": 2.388951063156128, + "reward_std": 0.46781638264656067, + "rewards/accuracy_reward/mean": 0.5324074029922485, + "rewards/accuracy_reward/std": 0.49952712655067444, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.146371990442276, + "step": 2627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1135.79248046875, + "completions/mean_terminated_length": 893.5678100585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5600127856800383, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12710418831407644, + "kl": 0.023223876953125, + "learning_rate": 5.347710567536057e-07, + "loss": 0.0808, + "num_tokens": 1475965836.0, + "reward": 2.3582589626312256, + "reward_std": 0.42983826994895935, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16906259953975677, + "step": 2628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 963.94873046875, + "completions/mean_terminated_length": 710.107421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5602258803473443, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.13288478611508414, + "kl": 0.028656005859375, + "learning_rate": 5.344364867843841e-07, + "loss": 0.0931, + "num_tokens": 1476463781.0, + "reward": 2.3671875, + "reward_std": 0.3562367558479309, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.14969991147518158, + "step": 2629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1043.43310546875, + "completions/mean_terminated_length": 814.9972534179688, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.5604389750146502, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11385585983768896, + "kl": 0.026763916015625, + "learning_rate": 5.34101925428376e-07, + "loss": 0.0431, + "num_tokens": 1476998135.0, + "reward": 2.357701063156128, + "reward_std": 0.331063449382782, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13748814165592194, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 989.1920166015625, + "completions/mean_terminated_length": 783.0773315429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5606520696819562, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12358929205034586, + "kl": 0.0283203125, + "learning_rate": 5.33767372870735e-07, + "loss": 0.0443, + "num_tokens": 1477515053.0, + "reward": 2.5027902126312256, + "reward_std": 0.4237518608570099, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15314915776252747, + "step": 2631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1027.5625, + "completions/mean_terminated_length": 802.3433227539062, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.5608651643492621, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11991277810461956, + "kl": 0.027099609375, + "learning_rate": 5.334328292966108e-07, + "loss": 0.0976, + "num_tokens": 1478053193.0, + "reward": 2.474888563156128, + "reward_std": 0.38539546728134155, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.216333270072937, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12015077471733093, + "step": 2632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1137.546875, + "completions/mean_terminated_length": 882.6199951171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5610782590165682, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12505912323811832, + "kl": 0.0263671875, + "learning_rate": 5.330982948911475e-07, + "loss": 0.0945, + "num_tokens": 1478637982.0, + "reward": 2.2918527126312256, + "reward_std": 0.4079464077949524, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14372976124286652, + "step": 2633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1100.790283203125, + "completions/mean_terminated_length": 856.005615234375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5612913536838741, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1154762700102432, + "kl": 0.0245361328125, + "learning_rate": 5.327637698394842e-07, + "loss": 0.0654, + "num_tokens": 1479203792.0, + "reward": 2.404576063156128, + "reward_std": 0.451653391122818, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.1657317876815796, + "step": 2634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 991.700927734375, + "completions/mean_terminated_length": 782.7005615234375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.56150444835118, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.134487181394703, + "kl": 0.030548095703125, + "learning_rate": 5.32429254326755e-07, + "loss": 0.0707, + "num_tokens": 1479717194.0, + "reward": 2.48046875, + "reward_std": 0.425826758146286, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15641570091247559, + "step": 2635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1120.5379638671875, + "completions/mean_terminated_length": 893.8250122070312, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.561717543018486, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11188389292700854, + "kl": 0.0257568359375, + "learning_rate": 5.320947485380883e-07, + "loss": 0.0675, + "num_tokens": 1480289723.0, + "reward": 2.2935268878936768, + "reward_std": 0.4769846200942993, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.17032796144485474, + "step": 2636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 936.02685546875, + "completions/mean_terminated_length": 764.0721435546875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.5619306376857919, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.28855193647952737, + "kl": 0.032989501953125, + "learning_rate": 5.317602526586082e-07, + "loss": 0.0892, + "num_tokens": 1480780599.0, + "reward": 2.5184152126312256, + "reward_std": 0.47805073857307434, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1524137556552887, + "step": 2637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1002.7076416015625, + "completions/mean_terminated_length": 792.5281982421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5621437323530979, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13759937677542167, + "kl": 0.02496337890625, + "learning_rate": 5.314257668734318e-07, + "loss": 0.0781, + "num_tokens": 1481304612.0, + "reward": 2.431361675262451, + "reward_std": 0.33426597714424133, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1327671855688095, + "step": 2638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1036.946533203125, + "completions/mean_terminated_length": 813.79833984375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5623568270204038, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12781642366808002, + "kl": 0.02801513671875, + "learning_rate": 5.310912913676721e-07, + "loss": 0.0691, + "num_tokens": 1481838332.0, + "reward": 2.353794813156128, + "reward_std": 0.4601910412311554, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14259286224842072, + "step": 2639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 858.4330444335938, + "completions/mean_terminated_length": 698.8202514648438, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.5625699216877098, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1435897039478035, + "kl": 0.0328369140625, + "learning_rate": 5.307568263264349e-07, + "loss": 0.0824, + "num_tokens": 1482283166.0, + "reward": 2.4732143878936768, + "reward_std": 0.41550031304359436, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13940991461277008, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1012.9732666015625, + "completions/mean_terminated_length": 811.4879760742188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5627830163550157, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13128086621221471, + "kl": 0.02618408203125, + "learning_rate": 5.304223719348215e-07, + "loss": 0.0897, + "num_tokens": 1482803442.0, + "reward": 2.450892925262451, + "reward_std": 0.39558514952659607, + "rewards/accuracy_reward/mean": 0.5532407164573669, + "rewards/accuracy_reward/std": 0.4977337718009949, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13422717154026031, + "step": 2641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1149.013427734375, + "completions/mean_terminated_length": 834.9096069335938, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.5629961110223217, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13171506506147648, + "kl": 0.0260009765625, + "learning_rate": 5.300879283779268e-07, + "loss": 0.1374, + "num_tokens": 1483385144.0, + "reward": 2.3119421005249023, + "reward_std": 0.4740091860294342, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16142748296260834, + "step": 2642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 918.6317138671875, + "completions/mean_terminated_length": 720.0288696289062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5632092056896276, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11980488117574017, + "kl": 0.028472900390625, + "learning_rate": 5.297534958408394e-07, + "loss": 0.0837, + "num_tokens": 1483860579.0, + "reward": 2.3643975257873535, + "reward_std": 0.4041427671909332, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.13000212609767914, + "step": 2643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1109.872802734375, + "completions/mean_terminated_length": 847.1971435546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5634223003569335, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10327414510349373, + "kl": 0.025238037109375, + "learning_rate": 5.294190745086426e-07, + "loss": 0.0299, + "num_tokens": 1484417754.0, + "reward": 2.255580425262451, + "reward_std": 0.4205310046672821, + "rewards/accuracy_reward/mean": 0.3683035671710968, + "rewards/accuracy_reward/std": 0.4828835725784302, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1599874496459961, + "step": 2644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1009.47998046875, + "completions/mean_terminated_length": 773.3233032226562, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5636353950242395, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11110738201786279, + "kl": 0.028289794921875, + "learning_rate": 5.290846645664125e-07, + "loss": 0.0707, + "num_tokens": 1484932257.0, + "reward": 2.484375, + "reward_std": 0.373627632856369, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14480386674404144, + "step": 2645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 971.9085083007812, + "completions/mean_terminated_length": 799.0647583007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5638484896915454, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12044673954833522, + "kl": 0.027191162109375, + "learning_rate": 5.287502661992197e-07, + "loss": 0.0644, + "num_tokens": 1485439784.0, + "reward": 2.5362725257873535, + "reward_std": 0.42643916606903076, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407235741615295, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10724954307079315, + "step": 2646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1066.075927734375, + "completions/mean_terminated_length": 765.4869384765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5640615843588515, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13605337010110938, + "kl": 0.02734375, + "learning_rate": 5.284158795921281e-07, + "loss": 0.0833, + "num_tokens": 1485984090.0, + "reward": 2.2611608505249023, + "reward_std": 0.36337438225746155, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14383503794670105, + "step": 2647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1045.3504638671875, + "completions/mean_terminated_length": 800.2583618164062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5642746790261574, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1342160202590979, + "kl": 0.028778076171875, + "learning_rate": 5.280815049301949e-07, + "loss": 0.0668, + "num_tokens": 1486524439.0, + "reward": 2.4174108505249023, + "reward_std": 0.5006897449493408, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14903545379638672, + "step": 2648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 991.5558471679688, + "completions/mean_terminated_length": 818.68310546875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5644877736934634, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11171373908048758, + "kl": 0.026885986328125, + "learning_rate": 5.277471423984709e-07, + "loss": 0.0426, + "num_tokens": 1487044176.0, + "reward": 2.3828125, + "reward_std": 0.3635784089565277, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.1118156760931015, + "step": 2649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1209.109375, + "completions/mean_terminated_length": 895.168701171875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.5647008683607693, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1228380250720558, + "kl": 0.023193359375, + "learning_rate": 5.274127921820004e-07, + "loss": 0.0854, + "num_tokens": 1487659137.0, + "reward": 2.2779018878936768, + "reward_std": 0.48724567890167236, + "rewards/accuracy_reward/mean": 0.41898149251937866, + "rewards/accuracy_reward/std": 0.49396437406539917, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16289502382278442, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 895.294677734375, + "completions/mean_terminated_length": 696.1361083984375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5649139630280752, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13537411244193284, + "kl": 0.029296875, + "learning_rate": 5.270784544658207e-07, + "loss": 0.0844, + "num_tokens": 1488129525.0, + "reward": 2.396205425262451, + "reward_std": 0.37333884835243225, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11468179523944855, + "step": 2651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1062.8504638671875, + "completions/mean_terminated_length": 845.4196166992188, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.5651270576953812, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12137100123020736, + "kl": 0.0257568359375, + "learning_rate": 5.267441294349619e-07, + "loss": 0.0601, + "num_tokens": 1488673474.0, + "reward": 2.431361675262451, + "reward_std": 0.4382311701774597, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13170984387397766, + "step": 2652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1058.85498046875, + "completions/mean_terminated_length": 820.4736328125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5653401523626871, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.122792074999492, + "kl": 0.023193359375, + "learning_rate": 5.264098172744471e-07, + "loss": 0.0966, + "num_tokens": 1489218801.0, + "reward": 2.33984375, + "reward_std": 0.41300156712532043, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.15291257202625275, + "step": 2653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 901.3638916015625, + "completions/mean_terminated_length": 685.4190673828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5655532470299931, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.16285266573177537, + "kl": 0.031463623046875, + "learning_rate": 5.26075518169293e-07, + "loss": 0.0759, + "num_tokens": 1489691300.0, + "reward": 2.431361675262451, + "reward_std": 0.41083529591560364, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12224180996417999, + "step": 2654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1017.69873046875, + "completions/mean_terminated_length": 845.9818115234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.565766341697299, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15405397292111547, + "kl": 0.02685546875, + "learning_rate": 5.257412323045081e-07, + "loss": 0.0605, + "num_tokens": 1490225421.0, + "reward": 2.5323662757873535, + "reward_std": 0.45641008019447327, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12956565618515015, + "step": 2655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1095.587158203125, + "completions/mean_terminated_length": 842.6864624023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.565979436364605, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11830537465485597, + "kl": 0.02398681640625, + "learning_rate": 5.254069598650947e-07, + "loss": 0.0805, + "num_tokens": 1490781476.0, + "reward": 2.3364956378936768, + "reward_std": 0.38298436999320984, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14517304301261902, + "step": 2656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 933.4464721679688, + "completions/mean_terminated_length": 757.7674560546875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.5661925310319109, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13954503308964486, + "kl": 0.028961181640625, + "learning_rate": 5.250727010360463e-07, + "loss": 0.1245, + "num_tokens": 1491266412.0, + "reward": 2.494419813156128, + "reward_std": 0.4833642542362213, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12585100531578064, + "step": 2657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1095.9710693359375, + "completions/mean_terminated_length": 876.27197265625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5664056256992169, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12008218210858618, + "kl": 0.025604248046875, + "learning_rate": 5.247384560023498e-07, + "loss": 0.0799, + "num_tokens": 1491835007.0, + "reward": 2.2840402126312256, + "reward_std": 0.4294620454311371, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.13851940631866455, + "step": 2658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1010.6406860351562, + "completions/mean_terminated_length": 818.5369873046875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5666187203665228, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13290692354104014, + "kl": 0.027862548828125, + "learning_rate": 5.244042249489844e-07, + "loss": 0.1051, + "num_tokens": 1492352286.0, + "reward": 2.3364956378936768, + "reward_std": 0.41447001695632935, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11780035495758057, + "step": 2659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 890.2545166015625, + "completions/mean_terminated_length": 738.227294921875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5668318150338287, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.20374180216549947, + "kl": 0.030120849609375, + "learning_rate": 5.240700080609212e-07, + "loss": 0.0196, + "num_tokens": 1492831584.0, + "reward": 2.4073662757873535, + "reward_std": 0.38919177651405334, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1334015429019928, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1008.1027221679688, + "completions/mean_terminated_length": 792.27490234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5670449097011347, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10491118610204067, + "kl": 0.026336669921875, + "learning_rate": 5.237358055231238e-07, + "loss": 0.019, + "num_tokens": 1493354734.0, + "reward": 2.4425225257873535, + "reward_std": 0.3357571065425873, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9849330186843872, + "rewards/tag_count_reward/std": 0.09263478219509125, + "step": 2661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 924.6563110351562, + "completions/mean_terminated_length": 760.8951416015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5672580043684406, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13595540863504654, + "kl": 0.02752685546875, + "learning_rate": 5.234016175205477e-07, + "loss": 0.058, + "num_tokens": 1493836452.0, + "reward": 2.51171875, + "reward_std": 0.4025529623031616, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12224180996417999, + "step": 2662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 993.2232666015625, + "completions/mean_terminated_length": 787.893310546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5674710990357467, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1372326715577458, + "kl": 0.02716064453125, + "learning_rate": 5.230674442381405e-07, + "loss": 0.0613, + "num_tokens": 1494354856.0, + "reward": 2.4029018878936768, + "reward_std": 0.45049983263015747, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13189572095870972, + "step": 2663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 931.3125610351562, + "completions/mean_terminated_length": 771.7857055664062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5676841937030526, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12982180455272763, + "kl": 0.029327392578125, + "learning_rate": 5.227332858608413e-07, + "loss": 0.0851, + "num_tokens": 1494843188.0, + "reward": 2.4888393878936768, + "reward_std": 0.3818797469139099, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09768717736005783, + "step": 2664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 914.4754638671875, + "completions/mean_terminated_length": 701.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5678972883703586, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13302626754717684, + "kl": 0.029632568359375, + "learning_rate": 5.223991425735812e-07, + "loss": 0.0859, + "num_tokens": 1495320793.0, + "reward": 2.4441964626312256, + "reward_std": 0.39099055528640747, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12155693024396896, + "step": 2665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 845.4777221679688, + "completions/mean_terminated_length": 687.5706787109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5681103830376645, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13556864439769312, + "kl": 0.031341552734375, + "learning_rate": 5.220650145612832e-07, + "loss": 0.0788, + "num_tokens": 1495759631.0, + "reward": 2.5106027126312256, + "reward_std": 0.3736709952354431, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1117081567645073, + "step": 2666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1170.1317138671875, + "completions/mean_terminated_length": 891.2794189453125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.5683234777049704, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12414931320693118, + "kl": 0.022705078125, + "learning_rate": 5.217309020088611e-07, + "loss": 0.0774, + "num_tokens": 1496351306.0, + "reward": 2.275669813156128, + "reward_std": 0.4311106204986572, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.18299148976802826, + "step": 2667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1125.200927734375, + "completions/mean_terminated_length": 853.1618041992188, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5685365723722764, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12326672749206961, + "kl": 0.02325439453125, + "learning_rate": 5.213968051012212e-07, + "loss": 0.0622, + "num_tokens": 1496929204.0, + "reward": 2.3404018878936768, + "reward_std": 0.48235875368118286, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.18239015340805054, + "step": 2668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1079.29248046875, + "completions/mean_terminated_length": 896.8567504882812, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.5687496670395823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12728705641042465, + "kl": 0.025177001953125, + "learning_rate": 5.210627240232603e-07, + "loss": 0.1041, + "num_tokens": 1497482951.0, + "reward": 2.390625, + "reward_std": 0.5176438093185425, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.2979368567466736, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.14942027628421783, + "step": 2669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1069.8638916015625, + "completions/mean_terminated_length": 781.5115356445312, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5689627617068883, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13078165031966787, + "kl": 0.02484130859375, + "learning_rate": 5.207286589598666e-07, + "loss": 0.0806, + "num_tokens": 1498035338.0, + "reward": 2.3565850257873535, + "reward_std": 0.46218574047088623, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15448831021785736, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1009.0692138671875, + "completions/mean_terminated_length": 800.1689453125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.5691758563741942, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12732452105413625, + "kl": 0.026824951171875, + "learning_rate": 5.203946100959197e-07, + "loss": 0.0688, + "num_tokens": 1498555321.0, + "reward": 2.3984375, + "reward_std": 0.39427533745765686, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.1251746416091919, + "step": 2671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1025.805908203125, + "completions/mean_terminated_length": 826.8186645507812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5693889510415002, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11646350761656792, + "kl": 0.025634765625, + "learning_rate": 5.200605776162898e-07, + "loss": 0.05, + "num_tokens": 1499077314.0, + "reward": 2.4112725257873535, + "reward_std": 0.37816962599754333, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.09811913222074509, + "step": 2672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1059.872802734375, + "completions/mean_terminated_length": 841.7847290039062, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5696020457088061, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12594720320446293, + "kl": 0.0244140625, + "learning_rate": 5.197265617058389e-07, + "loss": 0.0404, + "num_tokens": 1499620281.0, + "reward": 2.467076063156128, + "reward_std": 0.3397904932498932, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.191280335187912, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10366258770227432, + "step": 2673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1003.9620971679688, + "completions/mean_terminated_length": 763.0302124023438, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5698151403761121, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13434399127226226, + "kl": 0.027496337890625, + "learning_rate": 5.193925625494185e-07, + "loss": 0.1021, + "num_tokens": 1500141208.0, + "reward": 2.349330425262451, + "reward_std": 0.4278067946434021, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12292034178972244, + "step": 2674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 982.6942138671875, + "completions/mean_terminated_length": 833.6055908203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.570028235043418, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14180966152504046, + "kl": 0.02752685546875, + "learning_rate": 5.190585803318721e-07, + "loss": 0.095, + "num_tokens": 1500656063.0, + "reward": 2.400669813156128, + "reward_std": 0.492755651473999, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13399912416934967, + "step": 2675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1058.9866943359375, + "completions/mean_terminated_length": 850.4918823242188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5702413297107239, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14256061389495808, + "kl": 0.03021240234375, + "learning_rate": 5.187246152380331e-07, + "loss": 0.0885, + "num_tokens": 1501198153.0, + "reward": 2.2583706378936768, + "reward_std": 0.3884522318840027, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211319923401, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12825222313404083, + "step": 2676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 906.0178833007812, + "completions/mean_terminated_length": 759.3148193359375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.57045442437803, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1448747971532215, + "kl": 0.03021240234375, + "learning_rate": 5.183906674527256e-07, + "loss": 0.068, + "num_tokens": 1501680561.0, + "reward": 2.40625, + "reward_std": 0.4149862825870514, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13058778643608093, + "step": 2677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 958.1719360351562, + "completions/mean_terminated_length": 717.6375732421875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5706675190453359, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13589082611372647, + "kl": 0.030731201171875, + "learning_rate": 5.180567371607641e-07, + "loss": 0.0673, + "num_tokens": 1502188846.0, + "reward": 2.4916296005249023, + "reward_std": 0.46413901448249817, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1477556824684143, + "step": 2678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1067.9285888671875, + "completions/mean_terminated_length": 845.0630493164062, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5708806137126419, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12868667996485622, + "kl": 0.0279541015625, + "learning_rate": 5.177228245469537e-07, + "loss": 0.0583, + "num_tokens": 1502737710.0, + "reward": 2.3325893878936768, + "reward_std": 0.419612318277359, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14619383215904236, + "step": 2679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 959.2545166015625, + "completions/mean_terminated_length": 764.4263305664062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5710937083799478, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14187563472226494, + "kl": 0.029876708984375, + "learning_rate": 5.173889297960893e-07, + "loss": 0.0961, + "num_tokens": 1503233248.0, + "reward": 2.4135046005249023, + "reward_std": 0.46477067470550537, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1590748131275177, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1069.8973388671875, + "completions/mean_terminated_length": 747.73291015625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5713068030472538, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12958603274778882, + "kl": 0.024810791015625, + "learning_rate": 5.170550530929561e-07, + "loss": 0.0662, + "num_tokens": 1503782290.0, + "reward": 2.2455358505249023, + "reward_std": 0.38819533586502075, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.16025643050670624, + "step": 2681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1049.2232666015625, + "completions/mean_terminated_length": 879.718017578125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5715198977145597, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11571750499002063, + "kl": 0.02734375, + "learning_rate": 5.167211946223292e-07, + "loss": 0.0853, + "num_tokens": 1504319478.0, + "reward": 2.3091518878936768, + "reward_std": 0.46825939416885376, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14213687181472778, + "step": 2682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 914.591552734375, + "completions/mean_terminated_length": 742.6864013671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5717329923818656, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14757016144715496, + "kl": 0.031219482421875, + "learning_rate": 5.163873545689739e-07, + "loss": 0.1482, + "num_tokens": 1504806719.0, + "reward": 2.489955425262451, + "reward_std": 0.4557664394378662, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.1194591298699379, + "step": 2683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 901.0357666015625, + "completions/mean_terminated_length": 706.3812255859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5719460870491716, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1361870645356449, + "kl": 0.028472900390625, + "learning_rate": 5.160535331176449e-07, + "loss": 0.0767, + "num_tokens": 1505278543.0, + "reward": 2.4933037757873535, + "reward_std": 0.4082051217556, + "rewards/accuracy_reward/mean": 0.5810185074806213, + "rewards/accuracy_reward/std": 0.49396437406539917, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493200302124, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12425874173641205, + "step": 2684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1065.78125, + "completions/mean_terminated_length": 858.7189331054688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5721591817164775, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12172322553595843, + "kl": 0.0263671875, + "learning_rate": 5.157197304530869e-07, + "loss": 0.0848, + "num_tokens": 1505831997.0, + "reward": 2.3013393878936768, + "reward_std": 0.4286647140979767, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13636787235736847, + "step": 2685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 866.5313110351562, + "completions/mean_terminated_length": 718.1055297851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5723722763837835, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13484552029380745, + "kl": 0.03466796875, + "learning_rate": 5.153859467600342e-07, + "loss": 0.0419, + "num_tokens": 1506292683.0, + "reward": 2.4341518878936768, + "reward_std": 0.416586309671402, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13559210300445557, + "step": 2686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 994.5558471679688, + "completions/mean_terminated_length": 779.3359985351562, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5725853710510894, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11767949410187596, + "kl": 0.028900146484375, + "learning_rate": 5.150521822232106e-07, + "loss": 0.1094, + "num_tokens": 1506810356.0, + "reward": 2.4375, + "reward_std": 0.44715550541877747, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13583585619926453, + "step": 2687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 992.4241333007812, + "completions/mean_terminated_length": 780.1769409179688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.5727984657183954, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1411065824735259, + "kl": 0.0284423828125, + "learning_rate": 5.147184370273292e-07, + "loss": 0.0787, + "num_tokens": 1507322946.0, + "reward": 2.5027902126312256, + "reward_std": 0.44862544536590576, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13700605928897858, + "step": 2688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1024.41748046875, + "completions/mean_terminated_length": 774.2083740234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5730115603857013, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11972005072785544, + "kl": 0.02655029296875, + "learning_rate": 5.143847113570921e-07, + "loss": 0.0401, + "num_tokens": 1507858749.0, + "reward": 2.3683037757873535, + "reward_std": 0.40954843163490295, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407235741615295, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13106490671634674, + "step": 2689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1116.5848388671875, + "completions/mean_terminated_length": 852.3724975585938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5732246550530073, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1171880841106862, + "kl": 0.024078369140625, + "learning_rate": 5.140510053971912e-07, + "loss": 0.1078, + "num_tokens": 1508429763.0, + "reward": 2.345982313156128, + "reward_std": 0.4725840091705322, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.18130891025066376, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1084.0023193359375, + "completions/mean_terminated_length": 817.5982666015625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5734377497203133, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13899350451442757, + "kl": 0.025543212890625, + "learning_rate": 5.137173193323071e-07, + "loss": 0.1189, + "num_tokens": 1508990196.0, + "reward": 2.2527902126312256, + "reward_std": 0.5071700811386108, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.16790206730365753, + "step": 2691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1168.40625, + "completions/mean_terminated_length": 871.7074584960938, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5736508443876192, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12812073413287842, + "kl": 0.022735595703125, + "learning_rate": 5.133836533471098e-07, + "loss": 0.0359, + "num_tokens": 1509589002.0, + "reward": 2.239955425262451, + "reward_std": 0.42624154686927795, + "rewards/accuracy_reward/mean": 0.3482142984867096, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1396559327840805, + "step": 2692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 949.69873046875, + "completions/mean_terminated_length": 776.5814208984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5738639390549252, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13048153114447855, + "kl": 0.0302734375, + "learning_rate": 5.130500076262575e-07, + "loss": 0.1147, + "num_tokens": 1510080451.0, + "reward": 2.310267925262451, + "reward_std": 0.43063560128211975, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15035313367843628, + "step": 2693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 846.4063110351562, + "completions/mean_terminated_length": 674.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.5740770337222311, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.157715354974867, + "kl": 0.034637451171875, + "learning_rate": 5.127163823543975e-07, + "loss": 0.0597, + "num_tokens": 1510528537.0, + "reward": 2.443080425262451, + "reward_std": 0.40925729274749756, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14114972949028015, + "step": 2694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 904.0781860351562, + "completions/mean_terminated_length": 757.1259155273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5742901283895371, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1231995259338818, + "kl": 0.028564453125, + "learning_rate": 5.123827777161662e-07, + "loss": 0.0303, + "num_tokens": 1511006892.0, + "reward": 2.5502233505249023, + "reward_std": 0.36223557591438293, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266100406646729, + "step": 2695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 963.622802734375, + "completions/mean_terminated_length": 759.4031982421875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.574503223056843, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13287240852693366, + "kl": 0.0286865234375, + "learning_rate": 5.12049193896188e-07, + "loss": 0.12, + "num_tokens": 1511509299.0, + "reward": 2.474330425262451, + "reward_std": 0.45131295919418335, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15610112249851227, + "step": 2696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 974.7678833007812, + "completions/mean_terminated_length": 769.2553100585938, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.574716317724149, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13090804531145267, + "kl": 0.029083251953125, + "learning_rate": 5.117156310790762e-07, + "loss": 0.0788, + "num_tokens": 1512014219.0, + "reward": 2.3956475257873535, + "reward_std": 0.5151432156562805, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067809045314789, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.16028662025928497, + "step": 2697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 863.2344360351562, + "completions/mean_terminated_length": 740.6724243164062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5749294123914549, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12511088722009267, + "kl": 0.03094482421875, + "learning_rate": 5.113820894494324e-07, + "loss": 0.0635, + "num_tokens": 1512477364.0, + "reward": 2.4949777126312256, + "reward_std": 0.38574710488319397, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12086557596921921, + "step": 2698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 958.32373046875, + "completions/mean_terminated_length": 763.3289794921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5751425070587608, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14338422206753407, + "kl": 0.0283203125, + "learning_rate": 5.110485691918458e-07, + "loss": 0.0809, + "num_tokens": 1512977445.0, + "reward": 2.404017925262451, + "reward_std": 0.3946557939052582, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12223286926746368, + "step": 2699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 889.0870971679688, + "completions/mean_terminated_length": 726.898193359375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5753556017260668, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15457543743335214, + "kl": 0.030242919921875, + "learning_rate": 5.107150704908948e-07, + "loss": 0.1501, + "num_tokens": 1513439692.0, + "reward": 2.478794813156128, + "reward_std": 0.4300190508365631, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1502326875925064, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 992.5491333007812, + "completions/mean_terminated_length": 790.4414672851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5755686963933727, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11210175375752965, + "kl": 0.026458740234375, + "learning_rate": 5.103815935311452e-07, + "loss": 0.0593, + "num_tokens": 1513952002.0, + "reward": 2.364955425262451, + "reward_std": 0.4000357389450073, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14649668335914612, + "step": 2701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 966.3170166015625, + "completions/mean_terminated_length": 752.2941284179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5757817910606787, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.7458028801964932, + "kl": 0.034698486328125, + "learning_rate": 5.100481384971511e-07, + "loss": 0.0885, + "num_tokens": 1514468112.0, + "reward": 2.39453125, + "reward_std": 0.3782336413860321, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15968577563762665, + "step": 2702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 859.904052734375, + "completions/mean_terminated_length": 740.2186889648438, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5759948857279846, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15498907528530825, + "kl": 0.029541015625, + "learning_rate": 5.097147055734543e-07, + "loss": 0.0647, + "num_tokens": 1514923989.0, + "reward": 2.5, + "reward_std": 0.4385365843772888, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14672231674194336, + "step": 2703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 973.0156860351562, + "completions/mean_terminated_length": 790.5770263671875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5762079803952906, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1496255367495632, + "kl": 0.03009033203125, + "learning_rate": 5.093812949445844e-07, + "loss": 0.0722, + "num_tokens": 1515424956.0, + "reward": 2.3627233505249023, + "reward_std": 0.435127317905426, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1576608121395111, + "step": 2704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 900.1295166015625, + "completions/mean_terminated_length": 722.6237182617188, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.5764210750625965, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1323522606038315, + "kl": 0.02960205078125, + "learning_rate": 5.090479067950587e-07, + "loss": 0.0835, + "num_tokens": 1515897350.0, + "reward": 2.5513393878936768, + "reward_std": 0.4319068491458893, + "rewards/accuracy_reward/mean": 0.6607142686843872, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14238695800304413, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 997.2991333007812, + "completions/mean_terminated_length": 765.4005126953125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5766341697299026, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1320313717902181, + "kl": 0.025604248046875, + "learning_rate": 5.087145413093818e-07, + "loss": 0.0808, + "num_tokens": 1516418588.0, + "reward": 2.4581475257873535, + "reward_std": 0.441916823387146, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13646738231182098, + "step": 2706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 909.66748046875, + "completions/mean_terminated_length": 730.2403564453125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.5768472643972085, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15513858688235604, + "kl": 0.0338134765625, + "learning_rate": 5.083811986720463e-07, + "loss": 0.0208, + "num_tokens": 1516895303.0, + "reward": 2.5050225257873535, + "reward_std": 0.411132276058197, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12450840324163437, + "step": 2707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 978.3013916015625, + "completions/mean_terminated_length": 763.2144775390625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5770603590645144, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13063435725615272, + "kl": 0.030303955078125, + "learning_rate": 5.080478790675316e-07, + "loss": 0.0956, + "num_tokens": 1517403422.0, + "reward": 2.453125, + "reward_std": 0.3992384970188141, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14187753200531006, + "step": 2708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 957.94873046875, + "completions/mean_terminated_length": 749.2153930664062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5772734537318204, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14200426956042472, + "kl": 0.03082275390625, + "learning_rate": 5.077145826803048e-07, + "loss": 0.0819, + "num_tokens": 1517908183.0, + "reward": 2.5223214626312256, + "reward_std": 0.4615324139595032, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764310240745544, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16317449510097504, + "step": 2709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1007.6428833007812, + "completions/mean_terminated_length": 798.455810546875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5774865483991263, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11509399639090413, + "kl": 0.025054931640625, + "learning_rate": 5.073813096948197e-07, + "loss": 0.0749, + "num_tokens": 1518426423.0, + "reward": 2.3956475257873535, + "reward_std": 0.47592949867248535, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16767141222953796, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1022.29248046875, + "completions/mean_terminated_length": 768.0083618164062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5776996430664323, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13347564688702754, + "kl": 0.0257568359375, + "learning_rate": 5.070480602955175e-07, + "loss": 0.0419, + "num_tokens": 1518952522.0, + "reward": 2.3521206378936768, + "reward_std": 0.4461229145526886, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13254131376743317, + "step": 2711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 911.1138916015625, + "completions/mean_terminated_length": 752.0076293945312, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5779127377337382, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11090236767233502, + "kl": 0.0269775390625, + "learning_rate": 5.067148346668263e-07, + "loss": 0.0035, + "num_tokens": 1519430637.0, + "reward": 2.540736675262451, + "reward_std": 0.3817024528980255, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12673446536064148, + "step": 2712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 924.3348388671875, + "completions/mean_terminated_length": 770.3299560546875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5781258324010442, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13829030450082372, + "kl": 0.030181884765625, + "learning_rate": 5.063816329931609e-07, + "loss": 0.079, + "num_tokens": 1519918515.0, + "reward": 2.3431921005249023, + "reward_std": 0.43478500843048096, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13776934146881104, + "step": 2713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 885.0022583007812, + "completions/mean_terminated_length": 755.1389770507812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5783389270683501, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12903955672837428, + "kl": 0.0311279296875, + "learning_rate": 5.060484554589229e-07, + "loss": 0.0727, + "num_tokens": 1520378324.0, + "reward": 2.5128350257873535, + "reward_std": 0.4313536584377289, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13622933626174927, + "step": 2714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1024.83935546875, + "completions/mean_terminated_length": 767.6201171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5785520217356561, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13420294520411027, + "kl": 0.02911376953125, + "learning_rate": 5.057153022485005e-07, + "loss": 0.0589, + "num_tokens": 1520907660.0, + "reward": 2.361049175262451, + "reward_std": 0.4494055509567261, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.1634640097618103, + "step": 2715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1060.790283203125, + "completions/mean_terminated_length": 849.4363403320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.578765116402962, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12841310869216124, + "kl": 0.0244140625, + "learning_rate": 5.053821735462689e-07, + "loss": 0.0964, + "num_tokens": 1521456638.0, + "reward": 2.3872768878936768, + "reward_std": 0.45457085967063904, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.13713014125823975, + "step": 2716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 869.372802734375, + "completions/mean_terminated_length": 697.5524291992188, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5789782110702679, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13006710909228478, + "kl": 0.0323486328125, + "learning_rate": 5.050490695365889e-07, + "loss": 0.0311, + "num_tokens": 1521919797.0, + "reward": 2.428013563156128, + "reward_std": 0.4147588014602661, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1293378323316574, + "step": 2717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1043.5379638671875, + "completions/mean_terminated_length": 804.9088745117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5791913057375739, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12511495016467727, + "kl": 0.027557373046875, + "learning_rate": 5.047159904038081e-07, + "loss": 0.0681, + "num_tokens": 1522455862.0, + "reward": 2.2840402126312256, + "reward_std": 0.4744742512702942, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.17796529829502106, + "step": 2718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1008.7277221679688, + "completions/mean_terminated_length": 761.8287353515625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5794044004048798, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12937673598406707, + "kl": 0.02630615234375, + "learning_rate": 5.043829363322605e-07, + "loss": 0.0728, + "num_tokens": 1522987740.0, + "reward": 2.4497768878936768, + "reward_std": 0.4099780023097992, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13559210300445557, + "step": 2719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1039.9732666015625, + "completions/mean_terminated_length": 817.4931640625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5796174950721859, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1277239726023439, + "kl": 0.026458740234375, + "learning_rate": 5.040499075062658e-07, + "loss": 0.103, + "num_tokens": 1523522032.0, + "reward": 2.2963171005249023, + "reward_std": 0.4322567880153656, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.12884463369846344, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 981.1094360351562, + "completions/mean_terminated_length": 790.192138671875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5798305897394918, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12344985616488997, + "kl": 0.02581787109375, + "learning_rate": 5.037169041101303e-07, + "loss": 0.1146, + "num_tokens": 1524027953.0, + "reward": 2.3934152126312256, + "reward_std": 0.488517165184021, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14003422856330872, + "step": 2721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 919.544677734375, + "completions/mean_terminated_length": 738.2901611328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5800436844067978, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13659903885833286, + "kl": 0.0299072265625, + "learning_rate": 5.033839263281457e-07, + "loss": 0.0799, + "num_tokens": 1524516197.0, + "reward": 2.4347100257873535, + "reward_std": 0.4202975034713745, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13566918671131134, + "step": 2722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 954.1741333007812, + "completions/mean_terminated_length": 765.1884765625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5802567790741037, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15970958000916086, + "kl": 0.031097412109375, + "learning_rate": 5.030509743445897e-07, + "loss": 0.1002, + "num_tokens": 1525011987.0, + "reward": 2.357142925262451, + "reward_std": 0.44088101387023926, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401999473572, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1540280133485794, + "step": 2723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1011.2500610351562, + "completions/mean_terminated_length": 809.4293212890625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.5804698737414096, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12502603522582514, + "kl": 0.02569580078125, + "learning_rate": 5.027180483437258e-07, + "loss": 0.0604, + "num_tokens": 1525531315.0, + "reward": 2.4419643878936768, + "reward_std": 0.4300384819507599, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.13150234520435333, + "step": 2724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1031.2723388671875, + "completions/mean_terminated_length": 803.4808349609375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5806829684087156, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12000109413842157, + "kl": 0.026885986328125, + "learning_rate": 5.023851485098028e-07, + "loss": 0.0612, + "num_tokens": 1526056493.0, + "reward": 2.4363839626312256, + "reward_std": 0.4176003634929657, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.10928615182638168, + "step": 2725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 977.0022583007812, + "completions/mean_terminated_length": 817.7256469726562, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5808960630760215, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10548450722403173, + "kl": 0.027557373046875, + "learning_rate": 5.020522750270558e-07, + "loss": 0.0538, + "num_tokens": 1526562862.0, + "reward": 2.5083706378936768, + "reward_std": 0.3959433138370514, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15845361351966858, + "step": 2726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1007.8482666015625, + "completions/mean_terminated_length": 828.1361083984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5811091577433275, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1164570763765968, + "kl": 0.0264892578125, + "learning_rate": 5.017194280797042e-07, + "loss": 0.0745, + "num_tokens": 1527085466.0, + "reward": 2.462611675262451, + "reward_std": 0.435996949672699, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15332838892936707, + "step": 2727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 866.216552734375, + "completions/mean_terminated_length": 704.2461547851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5813222524106334, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15686892639953445, + "kl": 0.0308837890625, + "learning_rate": 5.013866078519539e-07, + "loss": 0.0768, + "num_tokens": 1527535355.0, + "reward": 2.4363839626312256, + "reward_std": 0.36757248640060425, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1566121131181717, + "step": 2728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 962.7120971679688, + "completions/mean_terminated_length": 788.3911743164062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5815353470779394, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.11359357169129575, + "kl": 0.026458740234375, + "learning_rate": 5.010538145279949e-07, + "loss": 0.0209, + "num_tokens": 1528038554.0, + "reward": 2.4793527126312256, + "reward_std": 0.3137418329715729, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1427011638879776, + "step": 2729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1050.337158203125, + "completions/mean_terminated_length": 830.1444091796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5817484417452453, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12917058934510264, + "kl": 0.0240478515625, + "learning_rate": 5.00721048292003e-07, + "loss": 0.0802, + "num_tokens": 1528585713.0, + "reward": 2.4676339626312256, + "reward_std": 0.440024197101593, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13606999814510345, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 891.6563110351562, + "completions/mean_terminated_length": 691.869140625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.5819615364125513, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.14747479017746679, + "kl": 0.031890869140625, + "learning_rate": 5.00388309328139e-07, + "loss": 0.1074, + "num_tokens": 1529044791.0, + "reward": 2.5033483505249023, + "reward_std": 0.33555570244789124, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4846842288970947, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15610112249851227, + "step": 2731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1040.180908203125, + "completions/mean_terminated_length": 843.9920043945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5821746310798572, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1224465610414118, + "kl": 0.025787353515625, + "learning_rate": 5.000555978205483e-07, + "loss": 0.0634, + "num_tokens": 1529574808.0, + "reward": 2.4520089626312256, + "reward_std": 0.3944771885871887, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13661938905715942, + "step": 2732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1038.857177734375, + "completions/mean_terminated_length": 858.2737426757812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5823877257471631, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12699152942196926, + "kl": 0.027374267578125, + "learning_rate": 4.997229139533613e-07, + "loss": 0.0358, + "num_tokens": 1530111512.0, + "reward": 2.5, + "reward_std": 0.515058696269989, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.15048591792583466, + "step": 2733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 870.8192138671875, + "completions/mean_terminated_length": 716.2398681640625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5826008204144691, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12121440941360011, + "kl": 0.028900146484375, + "learning_rate": 4.993902579106932e-07, + "loss": 0.0399, + "num_tokens": 1530572759.0, + "reward": 2.634486675262451, + "reward_std": 0.35433393716812134, + "rewards/accuracy_reward/mean": 0.7008928656578064, + "rewards/accuracy_reward/std": 0.45837873220443726, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12379446625709534, + "step": 2734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 953.1920166015625, + "completions/mean_terminated_length": 770.7239990234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.582813915081775, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13344528560239113, + "kl": 0.031036376953125, + "learning_rate": 4.990576298766434e-07, + "loss": 0.077, + "num_tokens": 1531071245.0, + "reward": 2.4441964626312256, + "reward_std": 0.4658892750740051, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.17707304656505585, + "step": 2735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1032.5223388671875, + "completions/mean_terminated_length": 825.0591430664062, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.5830270097490811, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12986098916455666, + "kl": 0.027984619140625, + "learning_rate": 4.987250300352961e-07, + "loss": 0.0291, + "num_tokens": 1531602455.0, + "reward": 2.376674175262451, + "reward_std": 0.4132971167564392, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12825222313404083, + "step": 2736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 985.5938110351562, + "completions/mean_terminated_length": 808.5260620117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.583240104416387, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12021896563034838, + "kl": 0.028900146484375, + "learning_rate": 4.983924585707199e-07, + "loss": 0.0995, + "num_tokens": 1532118113.0, + "reward": 2.4637277126312256, + "reward_std": 0.4351500868797302, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15628793835639954, + "step": 2737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 903.591552734375, + "completions/mean_terminated_length": 756.5767822265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.583453199083693, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12995656639750017, + "kl": 0.029449462890625, + "learning_rate": 4.980599156669676e-07, + "loss": 0.0491, + "num_tokens": 1532595322.0, + "reward": 2.5691964626312256, + "reward_std": 0.36906158924102783, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12425874918699265, + "step": 2738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 967.1808471679688, + "completions/mean_terminated_length": 735.7859497070312, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5836662937509989, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12935954475736547, + "kl": 0.02703857421875, + "learning_rate": 4.977274015080764e-07, + "loss": 0.0804, + "num_tokens": 1533099003.0, + "reward": 2.4966518878936768, + "reward_std": 0.3702591061592102, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13399912416934967, + "step": 2739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 992.1897583007812, + "completions/mean_terminated_length": 809.7722778320312, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5838793884183048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13912510550054555, + "kl": 0.02740478515625, + "learning_rate": 4.973949162780673e-07, + "loss": 0.0787, + "num_tokens": 1533608608.0, + "reward": 2.4291296005249023, + "reward_std": 0.48057666420936584, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14680634438991547, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 933.3348388671875, + "completions/mean_terminated_length": 764.2724609375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.5840924830856108, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1335605794019026, + "kl": 0.03070068359375, + "learning_rate": 4.970624601609455e-07, + "loss": 0.0812, + "num_tokens": 1534096246.0, + "reward": 2.48828125, + "reward_std": 0.3715760409832001, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12493880838155746, + "step": 2741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1005.83935546875, + "completions/mean_terminated_length": 819.347412109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5843055777529167, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13583239733942506, + "kl": 0.026092529296875, + "learning_rate": 4.967300333407e-07, + "loss": 0.0579, + "num_tokens": 1534622350.0, + "reward": 2.3526787757873535, + "reward_std": 0.4281512200832367, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.1605832874774933, + "step": 2742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1017.2277221679688, + "completions/mean_terminated_length": 803.2937622070312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5845186724202227, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14571441991549142, + "kl": 0.028350830078125, + "learning_rate": 4.963976360013036e-07, + "loss": 0.0777, + "num_tokens": 1535148868.0, + "reward": 2.4107143878936768, + "reward_std": 0.4748704433441162, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.17342577874660492, + "step": 2743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 934.1116333007812, + "completions/mean_terminated_length": 731.3192749023438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5847317670875286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14844399066971034, + "kl": 0.0340576171875, + "learning_rate": 4.960652683267125e-07, + "loss": 0.0715, + "num_tokens": 1535634822.0, + "reward": 2.46875, + "reward_std": 0.46500444412231445, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12672585248947144, + "step": 2744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 944.15185546875, + "completions/mean_terminated_length": 786.4591674804688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5849448617548346, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12904060383443267, + "kl": 0.029052734375, + "learning_rate": 4.957329305008674e-07, + "loss": 0.0522, + "num_tokens": 1536129226.0, + "reward": 2.3950893878936768, + "reward_std": 0.3883579969406128, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10779669135808945, + "step": 2745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1009.5469360351562, + "completions/mean_terminated_length": 797.3897705078125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5851579564221405, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.5190149681180043, + "kl": 0.030364990234375, + "learning_rate": 4.954006227076914e-07, + "loss": 0.0594, + "num_tokens": 1536656991.0, + "reward": 2.443080425262451, + "reward_std": 0.4300369322299957, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.14111433923244476, + "step": 2746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 958.0960083007812, + "completions/mean_terminated_length": 763.060546875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.5853710510894465, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12931311569296555, + "kl": 0.032623291015625, + "learning_rate": 4.950683451310913e-07, + "loss": 0.0501, + "num_tokens": 1537160218.0, + "reward": 2.4715402126312256, + "reward_std": 0.3861773908138275, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12035840004682541, + "step": 2747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 979.27685546875, + "completions/mean_terminated_length": 767.8181762695312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5855841457567524, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14186724362239053, + "kl": 0.026947021484375, + "learning_rate": 4.947360979549576e-07, + "loss": 0.1013, + "num_tokens": 1537672278.0, + "reward": 2.4676339626312256, + "reward_std": 0.39202460646629333, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266100406646729, + "step": 2748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1154.2679443359375, + "completions/mean_terminated_length": 900.7449951171875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5857972404240583, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11708431994437764, + "kl": 0.02374267578125, + "learning_rate": 4.944038813631636e-07, + "loss": 0.0659, + "num_tokens": 1538259838.0, + "reward": 2.3549108505249023, + "reward_std": 0.44388580322265625, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 2749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 926.27685546875, + "completions/mean_terminated_length": 791.6699829101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5860103350913644, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13073406091318251, + "kl": 0.0321044921875, + "learning_rate": 4.940716955395657e-07, + "loss": 0.0605, + "num_tokens": 1538740586.0, + "reward": 2.4737725257873535, + "reward_std": 0.3734135031700134, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12109260261058807, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1043.484375, + "completions/mean_terminated_length": 847.9386596679688, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.5862234297586703, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1340401628349535, + "kl": 0.025421142578125, + "learning_rate": 4.937395406680035e-07, + "loss": 0.1058, + "num_tokens": 1539274675.0, + "reward": 2.377232313156128, + "reward_std": 0.4413833022117615, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14285962283611298, + "step": 2751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1029.3817138671875, + "completions/mean_terminated_length": 758.901123046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5864365244259763, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1188922580423902, + "kl": 0.02655029296875, + "learning_rate": 4.934074169322992e-07, + "loss": 0.0571, + "num_tokens": 1539816958.0, + "reward": 2.4369421005249023, + "reward_std": 0.36892932653427124, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.11001905798912048, + "step": 2752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 890.794677734375, + "completions/mean_terminated_length": 701.4337768554688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5866496190932822, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14437006202999375, + "kl": 0.029541015625, + "learning_rate": 4.930753245162577e-07, + "loss": 0.0791, + "num_tokens": 1540282626.0, + "reward": 2.5502233505249023, + "reward_std": 0.3586868941783905, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.10928615182638168, + "step": 2753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1022.26123046875, + "completions/mean_terminated_length": 802.6585693359375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5868627137605882, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1293845638069153, + "kl": 0.026702880859375, + "learning_rate": 4.927432636036669e-07, + "loss": 0.0534, + "num_tokens": 1540810983.0, + "reward": 2.3895089626312256, + "reward_std": 0.4244554340839386, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1375301331281662, + "step": 2754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 951.6964721679688, + "completions/mean_terminated_length": 755.5158081054688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5870758084278941, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.23829936394360238, + "kl": 0.02923583984375, + "learning_rate": 4.924112343782971e-07, + "loss": 0.078, + "num_tokens": 1541307999.0, + "reward": 2.4637277126312256, + "reward_std": 0.3630683124065399, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10874075442552567, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1043.5982666015625, + "completions/mean_terminated_length": 791.094970703125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5872889030952, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12314977184411488, + "kl": 0.024566650390625, + "learning_rate": 4.920792370239009e-07, + "loss": 0.062, + "num_tokens": 1541848155.0, + "reward": 2.463169813156128, + "reward_std": 0.3783283531665802, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11141301691532135, + "step": 2756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1028.04248046875, + "completions/mean_terminated_length": 813.0243530273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.587501997762506, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1397384329271977, + "kl": 0.0291748046875, + "learning_rate": 4.917472717242137e-07, + "loss": 0.0849, + "num_tokens": 1542375214.0, + "reward": 2.3560268878936768, + "reward_std": 0.49758273363113403, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1662929803133011, + "step": 2757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 927.27685546875, + "completions/mean_terminated_length": 757.295654296875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5877150924298119, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 6.431515629211438, + "kl": 0.23150634765625, + "learning_rate": 4.914153386629528e-07, + "loss": 0.0893, + "num_tokens": 1542861162.0, + "reward": 2.4933037757873535, + "reward_std": 0.46236032247543335, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.49251341819763184, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11611520498991013, + "step": 2758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1002.88623046875, + "completions/mean_terminated_length": 782.5648803710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5879281870971179, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13969255214275655, + "kl": 0.028961181640625, + "learning_rate": 4.910834380238175e-07, + "loss": 0.0727, + "num_tokens": 1543375863.0, + "reward": 2.38671875, + "reward_std": 0.46862998604774475, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14680634438991547, + "step": 2759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 993.0982666015625, + "completions/mean_terminated_length": 791.095703125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5881412817644238, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12951751927128352, + "kl": 0.028961181640625, + "learning_rate": 4.907515699904897e-07, + "loss": 0.0467, + "num_tokens": 1543894035.0, + "reward": 2.3995537757873535, + "reward_std": 0.4175123870372772, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09730304032564163, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 907.0089721679688, + "completions/mean_terminated_length": 766.88720703125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5883543764317298, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1300193945552892, + "kl": 0.030517578125, + "learning_rate": 4.904197347466327e-07, + "loss": 0.0908, + "num_tokens": 1544365351.0, + "reward": 2.5200893878936768, + "reward_std": 0.4406583309173584, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12449963390827179, + "step": 2761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1052.9398193359375, + "completions/mean_terminated_length": 799.2969360351562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5885674710990357, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12146183009950676, + "kl": 0.02655029296875, + "learning_rate": 4.900879324758922e-07, + "loss": 0.1105, + "num_tokens": 1544911004.0, + "reward": 2.4581475257873535, + "reward_std": 0.468540757894516, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15641570091247559, + "step": 2762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1054.0848388671875, + "completions/mean_terminated_length": 828.0712280273438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5887805657663417, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12304890862021266, + "kl": 0.027862548828125, + "learning_rate": 4.897561633618951e-07, + "loss": 0.0652, + "num_tokens": 1545454002.0, + "reward": 2.3046875, + "reward_std": 0.47808051109313965, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.18767967820167542, + "step": 2763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1101.9085693359375, + "completions/mean_terminated_length": 860.7479248046875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.5889936604336476, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11027755035543774, + "kl": 0.0238037109375, + "learning_rate": 4.894244275882502e-07, + "loss": 0.0722, + "num_tokens": 1546016585.0, + "reward": 2.424107313156128, + "reward_std": 0.3524986505508423, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9866071343421936, + "rewards/tag_count_reward/std": 0.08746545761823654, + "step": 2764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 990.5313110351562, + "completions/mean_terminated_length": 814.2864990234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5892067551009535, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12685517689018902, + "kl": 0.028961181640625, + "learning_rate": 4.890927253385481e-07, + "loss": 0.0365, + "num_tokens": 1546531415.0, + "reward": 2.407924175262451, + "reward_std": 0.43417537212371826, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 2765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1029.19873046875, + "completions/mean_terminated_length": 765.9129028320312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5894198497682596, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1348522930847106, + "kl": 0.02716064453125, + "learning_rate": 4.887610567963605e-07, + "loss": 0.0855, + "num_tokens": 1547063840.0, + "reward": 2.3911831378936768, + "reward_std": 0.3990918695926666, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12224180996417999, + "step": 2766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1121.5826416015625, + "completions/mean_terminated_length": 801.648681640625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.5896329444355655, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1079506573575776, + "kl": 0.023712158203125, + "learning_rate": 4.884294221452405e-07, + "loss": 0.0397, + "num_tokens": 1547634789.0, + "reward": 2.2963171005249023, + "reward_std": 0.44277262687683105, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.16280589997768402, + "step": 2767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1147.779052734375, + "completions/mean_terminated_length": 882.3959350585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5898460391028715, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11446543844527704, + "kl": 0.02337646484375, + "learning_rate": 4.880978215687223e-07, + "loss": 0.0517, + "num_tokens": 1548212754.0, + "reward": 2.2901787757873535, + "reward_std": 0.41123324632644653, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.4852356016635895, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14336557686328888, + "step": 2768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1007.49560546875, + "completions/mean_terminated_length": 794.9193725585938, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.5900591337701774, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12570762342038644, + "kl": 0.02630615234375, + "learning_rate": 4.877662552503218e-07, + "loss": 0.0943, + "num_tokens": 1548724784.0, + "reward": 2.4029018878936768, + "reward_std": 0.4209842085838318, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12648425996303558, + "step": 2769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1071.4754638671875, + "completions/mean_terminated_length": 819.1151733398438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5902722284374834, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12278755986742323, + "kl": 0.028076171875, + "learning_rate": 4.874347233735358e-07, + "loss": 0.0826, + "num_tokens": 1549277637.0, + "reward": 2.3738839626312256, + "reward_std": 0.5288773775100708, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.9475446343421936, + "rewards/tag_count_reward/std": 0.17865578830242157, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 966.6250610351562, + "completions/mean_terminated_length": 792.9326171875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.5904853231047893, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13897269767154788, + "kl": 0.0286865234375, + "learning_rate": 4.871032261218409e-07, + "loss": 0.0259, + "num_tokens": 1549782973.0, + "reward": 2.4447546005249023, + "reward_std": 0.4732365608215332, + "rewards/accuracy_reward/mean": 0.5879629850387573, + "rewards/accuracy_reward/std": 0.4927723705768585, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14680634438991547, + "step": 2771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1020.08935546875, + "completions/mean_terminated_length": 796.6304321289062, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.5906984177720953, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13729784362572958, + "kl": 0.0264892578125, + "learning_rate": 4.867717636786964e-07, + "loss": 0.1184, + "num_tokens": 1550312885.0, + "reward": 2.3666296005249023, + "reward_std": 0.4789509177207947, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.15291258692741394, + "step": 2772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1119.8148193359375, + "completions/mean_terminated_length": 863.3076782226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5909115124394012, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11564290828594635, + "kl": 0.02325439453125, + "learning_rate": 4.864403362275407e-07, + "loss": 0.0478, + "num_tokens": 1550886450.0, + "reward": 2.4090402126312256, + "reward_std": 0.4232191741466522, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13064393401145935, + "step": 2773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 908.12060546875, + "completions/mean_terminated_length": 745.2805786132812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5911246071067071, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1324840132015721, + "kl": 0.031097412109375, + "learning_rate": 4.861089439517939e-07, + "loss": 0.0504, + "num_tokens": 1551360680.0, + "reward": 2.505580425262451, + "reward_std": 0.44155511260032654, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14835961163043976, + "step": 2774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1006.15185546875, + "completions/mean_terminated_length": 796.6649169921875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5913377017740131, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13505613612280754, + "kl": 0.029571533203125, + "learning_rate": 4.857775870348562e-07, + "loss": 0.0751, + "num_tokens": 1551874876.0, + "reward": 2.3761162757873535, + "reward_std": 0.46554049849510193, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16289502382278442, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 943.357177734375, + "completions/mean_terminated_length": 724.7914428710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.591550796441319, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13175352288456296, + "kl": 0.028045654296875, + "learning_rate": 4.854462656601083e-07, + "loss": 0.0615, + "num_tokens": 1552366412.0, + "reward": 2.4464287757873535, + "reward_std": 0.4057818353176117, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12935835123062134, + "step": 2776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 985.8795166015625, + "completions/mean_terminated_length": 802.3717651367188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.591763891108625, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11926111403859492, + "kl": 0.0260009765625, + "learning_rate": 4.851149800109113e-07, + "loss": 0.0725, + "num_tokens": 1552871334.0, + "reward": 2.5541296005249023, + "reward_std": 0.37367162108421326, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11561822891235352, + "step": 2777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 984.5513916015625, + "completions/mean_terminated_length": 774.1363525390625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.5919769857759309, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1174080925434695, + "kl": 0.02813720703125, + "learning_rate": 4.84783730270606e-07, + "loss": 0.0592, + "num_tokens": 1553382413.0, + "reward": 2.4799108505249023, + "reward_std": 0.40824735164642334, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.1090860590338707, + "step": 2778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 926.6295166015625, + "completions/mean_terminated_length": 753.2216186523438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.592190080443237, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.145289494256476, + "kl": 0.032989501953125, + "learning_rate": 4.844525166225145e-07, + "loss": 0.0655, + "num_tokens": 1553862599.0, + "reward": 2.4877233505249023, + "reward_std": 0.470186710357666, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.14495466649532318, + "step": 2779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1056.7098388671875, + "completions/mean_terminated_length": 789.9320068359375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.5924031751105429, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11878164995385453, + "kl": 0.024688720703125, + "learning_rate": 4.841213392499375e-07, + "loss": 0.0758, + "num_tokens": 1554403861.0, + "reward": 2.3989956378936768, + "reward_std": 0.37878915667533875, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10703980922698975, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1004.0982666015625, + "completions/mean_terminated_length": 804.2020874023438, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.5926162697778488, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1243415865349804, + "kl": 0.02520751953125, + "learning_rate": 4.837901983361569e-07, + "loss": 0.0816, + "num_tokens": 1554922945.0, + "reward": 2.5379464626312256, + "reward_std": 0.36585915088653564, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10947445780038834, + "step": 2781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1101.8504638671875, + "completions/mean_terminated_length": 826.4581909179688, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5928293644451548, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11381367930198392, + "kl": 0.024505615234375, + "learning_rate": 4.834590940644335e-07, + "loss": 0.0572, + "num_tokens": 1555484814.0, + "reward": 2.421875, + "reward_std": 0.41148841381073, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.1282729059457779, + "step": 2782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 986.060302734375, + "completions/mean_terminated_length": 809.0703125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.5930424591124607, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13663151511483282, + "kl": 0.0286865234375, + "learning_rate": 4.831280266180083e-07, + "loss": 0.127, + "num_tokens": 1555996793.0, + "reward": 2.4994421005249023, + "reward_std": 0.48856019973754883, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.17068946361541748, + "step": 2783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 862.0156860351562, + "completions/mean_terminated_length": 689.1227416992188, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.5932555537797667, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13802137471156883, + "kl": 0.032012939453125, + "learning_rate": 4.827969961801017e-07, + "loss": 0.0757, + "num_tokens": 1556443440.0, + "reward": 2.4737725257873535, + "reward_std": 0.3226315677165985, + "rewards/accuracy_reward/mean": 0.5601851940155029, + "rewards/accuracy_reward/std": 0.496940016746521, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9871651530265808, + "rewards/tag_count_reward/std": 0.07646700739860535, + "step": 2784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 785.794677734375, + "completions/mean_terminated_length": 655.2216796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5934686484470726, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14125665294641107, + "kl": 0.032501220703125, + "learning_rate": 4.824660029339137e-07, + "loss": 0.083, + "num_tokens": 1556856212.0, + "reward": 2.5558037757873535, + "reward_std": 0.40110641717910767, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12383606284856796, + "step": 2785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1026.247802734375, + "completions/mean_terminated_length": 780.00830078125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5936817431143786, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 57.62522610721485, + "kl": 1.451904296875, + "learning_rate": 4.821350470626239e-07, + "loss": 0.1161, + "num_tokens": 1557390899.0, + "reward": 2.3292412757873535, + "reward_std": 0.372689813375473, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266100406646729, + "step": 2786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 938.4888916015625, + "completions/mean_terminated_length": 750.1906127929688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5938948377816845, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15258804010134852, + "kl": 0.029510498046875, + "learning_rate": 4.818041287493909e-07, + "loss": 0.0896, + "num_tokens": 1557883374.0, + "reward": 2.4542412757873535, + "reward_std": 0.4683377146720886, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15481625497341156, + "step": 2787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1084.0960693359375, + "completions/mean_terminated_length": 803.5360107421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5941079324489905, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1069040004224171, + "kl": 0.02496337890625, + "learning_rate": 4.814732481773527e-07, + "loss": 0.0661, + "num_tokens": 1558439689.0, + "reward": 2.50390625, + "reward_std": 0.37032830715179443, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.09830980002880096, + "step": 2788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1000.8460083007812, + "completions/mean_terminated_length": 769.730224609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5943210271162964, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14165697878707825, + "kl": 0.0302734375, + "learning_rate": 4.811424055296263e-07, + "loss": 0.1026, + "num_tokens": 1558963748.0, + "reward": 2.439732313156128, + "reward_std": 0.4267694056034088, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15456202626228333, + "step": 2789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 957.7678833007812, + "completions/mean_terminated_length": 795.6307983398438, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5945341217836023, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13670294685240122, + "kl": 0.0308837890625, + "learning_rate": 4.808116009893079e-07, + "loss": 0.0163, + "num_tokens": 1559455212.0, + "reward": 2.443080425262451, + "reward_std": 0.3932071924209595, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12177752703428268, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 880.9464721679688, + "completions/mean_terminated_length": 757.0370483398438, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5947472164509083, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13150223444352246, + "kl": 0.032440185546875, + "learning_rate": 4.804808347394724e-07, + "loss": 0.0812, + "num_tokens": 1559913428.0, + "reward": 2.5574777126312256, + "reward_std": 0.40665706992149353, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10962118953466415, + "step": 2791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1015.6295166015625, + "completions/mean_terminated_length": 846.6961059570312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5949603111182142, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11295231772265461, + "kl": 0.028656005859375, + "learning_rate": 4.801501069631736e-07, + "loss": 0.1033, + "num_tokens": 1560431358.0, + "reward": 2.4877233505249023, + "reward_std": 0.443889319896698, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1401556432247162, + "step": 2792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1027.9754638671875, + "completions/mean_terminated_length": 812.9432373046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5951734057855202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13960724355842696, + "kl": 0.029510498046875, + "learning_rate": 4.798194178434441e-07, + "loss": 0.0552, + "num_tokens": 1560966307.0, + "reward": 2.33203125, + "reward_std": 0.49274882674217224, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.16544531285762787, + "step": 2793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 965.77685546875, + "completions/mean_terminated_length": 798.4226684570312, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5953865004528262, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1330063353301934, + "kl": 0.02777099609375, + "learning_rate": 4.794887675632951e-07, + "loss": 0.0752, + "num_tokens": 1561467279.0, + "reward": 2.4642858505249023, + "reward_std": 0.42790815234184265, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11805575340986252, + "step": 2794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 952.5379638671875, + "completions/mean_terminated_length": 725.1778564453125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.5955995951201322, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13592517757588107, + "kl": 0.027984619140625, + "learning_rate": 4.791581563057156e-07, + "loss": 0.0983, + "num_tokens": 1561964080.0, + "reward": 2.4754464626312256, + "reward_std": 0.4108010232448578, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1373893767595291, + "step": 2795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 990.8504638671875, + "completions/mean_terminated_length": 785.0586547851562, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.5958126897874381, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12712627560726303, + "kl": 0.027587890625, + "learning_rate": 4.788275842536746e-07, + "loss": 0.0847, + "num_tokens": 1562476605.0, + "reward": 2.4581475257873535, + "reward_std": 0.4451054632663727, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15730707347393036, + "step": 2796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1090.0804443359375, + "completions/mean_terminated_length": 845.90478515625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.596025784454744, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13149659682070125, + "kl": 0.02325439453125, + "learning_rate": 4.784970515901176e-07, + "loss": 0.0802, + "num_tokens": 1563030945.0, + "reward": 2.3878350257873535, + "reward_std": 0.44034963846206665, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14177079498767853, + "step": 2797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1184.6898193359375, + "completions/mean_terminated_length": 879.53173828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.59623887912205, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11223131520575452, + "kl": 0.02471923828125, + "learning_rate": 4.781665584979698e-07, + "loss": 0.0594, + "num_tokens": 1563638134.0, + "reward": 2.2901787757873535, + "reward_std": 0.4740681052207947, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.18907469511032104, + "step": 2798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1040.02685546875, + "completions/mean_terminated_length": 824.2276611328125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.5964519737893559, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13729277910788348, + "kl": 0.027252197265625, + "learning_rate": 4.778361051601333e-07, + "loss": 0.1309, + "num_tokens": 1564179490.0, + "reward": 2.4207589626312256, + "reward_std": 0.5194218754768372, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.17913034558296204, + "step": 2799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1068.0223388671875, + "completions/mean_terminated_length": 811.2957763671875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5966650684566619, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10694725886532257, + "kl": 0.025848388671875, + "learning_rate": 4.775056917594889e-07, + "loss": 0.018, + "num_tokens": 1564734268.0, + "reward": 2.3152902126312256, + "reward_std": 0.3530708849430084, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1240563914179802, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 969.6785888671875, + "completions/mean_terminated_length": 742.3567504882812, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5968781631239678, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1353107002243978, + "kl": 0.031005859375, + "learning_rate": 4.771753184788952e-07, + "loss": 0.0933, + "num_tokens": 1565241580.0, + "reward": 2.43359375, + "reward_std": 0.40118223428726196, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13646738231182098, + "step": 2801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 975.1785888671875, + "completions/mean_terminated_length": 769.74462890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5970912577912738, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14500113807686824, + "kl": 0.0289306640625, + "learning_rate": 4.768449855011884e-07, + "loss": 0.061, + "num_tokens": 1565752492.0, + "reward": 2.5184152126312256, + "reward_std": 0.4240882098674774, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14249980449676514, + "step": 2802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 990.0938110351562, + "completions/mean_terminated_length": 829.6400756835938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5973043524585797, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14096468938547524, + "kl": 0.02813720703125, + "learning_rate": 4.765146930091826e-07, + "loss": 0.0968, + "num_tokens": 1566261910.0, + "reward": 2.4380581378936768, + "reward_std": 0.4481762647628784, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14732414484024048, + "step": 2803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1022.08935546875, + "completions/mean_terminated_length": 805.8162231445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5975174471258857, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13166898100986094, + "kl": 0.02740478515625, + "learning_rate": 4.7618444118566934e-07, + "loss": 0.073, + "num_tokens": 1566783246.0, + "reward": 2.396763563156128, + "reward_std": 0.45847225189208984, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.1475696861743927, + "step": 2804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 1023.6563110351562, + "completions/mean_terminated_length": 811.0565795898438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5977305417931916, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 7.285055609327246, + "kl": 0.039093017578125, + "learning_rate": 4.7585423021341795e-07, + "loss": 0.0503, + "num_tokens": 1567308244.0, + "reward": 2.388951063156128, + "reward_std": 0.4243941605091095, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.14443159103393555, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1077.51123046875, + "completions/mean_terminated_length": 863.3160400390625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5979436364604975, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16797211907182072, + "kl": 0.025299072265625, + "learning_rate": 4.7552406027517477e-07, + "loss": 0.075, + "num_tokens": 1567867113.0, + "reward": 2.329799175262451, + "reward_std": 0.4269038736820221, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.15912973880767822, + "step": 2806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1091.1317138671875, + "completions/mean_terminated_length": 873.54248046875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5981567311278035, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11755510535573062, + "kl": 0.024322509765625, + "learning_rate": 4.751939315536634e-07, + "loss": 0.0657, + "num_tokens": 1568432404.0, + "reward": 2.3872768878936768, + "reward_std": 0.44564881920814514, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13709372282028198, + "step": 2807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 923.93310546875, + "completions/mean_terminated_length": 753.4447021484375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.5983698257951094, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13295521685144893, + "kl": 0.0301513671875, + "learning_rate": 4.748638442315851e-07, + "loss": 0.0688, + "num_tokens": 1568923174.0, + "reward": 2.454799175262451, + "reward_std": 0.4380059242248535, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.1475696861743927, + "step": 2808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1117.7054443359375, + "completions/mean_terminated_length": 870.677978515625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5985829204624155, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11375088305490733, + "kl": 0.02398681640625, + "learning_rate": 4.745337984916178e-07, + "loss": 0.0705, + "num_tokens": 1569495170.0, + "reward": 2.279576063156128, + "reward_std": 0.4200153946876526, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.8727678656578064, + "rewards/format_reward/std": 0.3336053788661957, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14963631331920624, + "step": 2809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 830.2835083007812, + "completions/mean_terminated_length": 710.8995361328125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5987960151297214, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12982746662993483, + "kl": 0.0341796875, + "learning_rate": 4.7420379451641656e-07, + "loss": 0.0565, + "num_tokens": 1569932097.0, + "reward": 2.5825893878936768, + "reward_std": 0.49067869782447815, + "rewards/accuracy_reward/mean": 0.6964285969734192, + "rewards/accuracy_reward/std": 0.4603137671947479, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1353386640548706, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 951.3482666015625, + "completions/mean_terminated_length": 778.490966796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5990091097970274, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15890112557891883, + "kl": 0.02825927734375, + "learning_rate": 4.7387383248861347e-07, + "loss": 0.1304, + "num_tokens": 1570423549.0, + "reward": 2.515625, + "reward_std": 0.42147308588027954, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.1299937218427658, + "step": 2811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1003.9910888671875, + "completions/mean_terminated_length": 826.8093872070312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5992222044643333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13162983999777622, + "kl": 0.02777099609375, + "learning_rate": 4.7354391259081707e-07, + "loss": 0.0904, + "num_tokens": 1570948633.0, + "reward": 2.458705425262451, + "reward_std": 0.5123863816261292, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14262786507606506, + "step": 2812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 943.4844360351562, + "completions/mean_terminated_length": 738.9443969726562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5994352991316392, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14226413764588752, + "kl": 0.028289794921875, + "learning_rate": 4.732140350056129e-07, + "loss": 0.1182, + "num_tokens": 1571432978.0, + "reward": 2.439732313156128, + "reward_std": 0.3928465247154236, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12951265275478363, + "step": 2813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1076.087158203125, + "completions/mean_terminated_length": 778.5626831054688, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5996483937989452, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1275971133564519, + "kl": 0.0255126953125, + "learning_rate": 4.728841999155626e-07, + "loss": 0.0985, + "num_tokens": 1571982745.0, + "reward": 2.3627233505249023, + "reward_std": 0.45270001888275146, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1693282276391983, + "step": 2814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1014.97998046875, + "completions/mean_terminated_length": 813.8853149414062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.5998614884662511, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1284774697540026, + "kl": 0.025665283203125, + "learning_rate": 4.725544075032053e-07, + "loss": 0.0887, + "num_tokens": 1572511664.0, + "reward": 2.3169643878936768, + "reward_std": 0.42080557346343994, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13940991461277008, + "step": 2815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1033.4554443359375, + "completions/mean_terminated_length": 819.578369140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6000745831335571, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13404899368190185, + "kl": 0.0263671875, + "learning_rate": 4.7222465795105525e-07, + "loss": 0.0839, + "num_tokens": 1573042972.0, + "reward": 2.513392925262451, + "reward_std": 0.46064209938049316, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13212740421295166, + "step": 2816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 924.2969360351562, + "completions/mean_terminated_length": 740.4181518554688, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.600287677800863, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14049292357184728, + "kl": 0.028717041015625, + "learning_rate": 4.7189495144160405e-07, + "loss": 0.1036, + "num_tokens": 1573524961.0, + "reward": 2.462611675262451, + "reward_std": 0.4035714864730835, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12224180996417999, + "step": 2817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 997.7098388671875, + "completions/mean_terminated_length": 758.876708984375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.600500772468169, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1363837907245474, + "kl": 0.025146484375, + "learning_rate": 4.715652881573187e-07, + "loss": 0.0613, + "num_tokens": 1574038591.0, + "reward": 2.439174175262451, + "reward_std": 0.3859565854072571, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.09542291611433029, + "step": 2818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 973.1451416015625, + "completions/mean_terminated_length": 746.5540771484375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6007138671354749, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1256368810310527, + "kl": 0.02777099609375, + "learning_rate": 4.7123566828064265e-07, + "loss": 0.0633, + "num_tokens": 1574545104.0, + "reward": 2.48828125, + "reward_std": 0.36962586641311646, + "rewards/accuracy_reward/mean": 0.5879629850387573, + "rewards/accuracy_reward/std": 0.4927723705768585, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1340305656194687, + "step": 2819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1056.946533203125, + "completions/mean_terminated_length": 811.2534790039062, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.6009269618027809, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13034787992930544, + "kl": 0.02667236328125, + "learning_rate": 4.709060919939953e-07, + "loss": 0.0358, + "num_tokens": 1575085288.0, + "reward": 2.2393975257873535, + "reward_std": 0.4536352753639221, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.151995450258255, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 968.4241333007812, + "completions/mean_terminated_length": 708.249267578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6011400564700868, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.137201490972725, + "kl": 0.0281982421875, + "learning_rate": 4.7057655947977183e-07, + "loss": 0.1299, + "num_tokens": 1575584374.0, + "reward": 2.44921875, + "reward_std": 0.4403477609157562, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.15014436841011047, + "step": 2821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 948.37060546875, + "completions/mean_terminated_length": 781.5886840820312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6013531511373927, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1308392054316944, + "kl": 0.028656005859375, + "learning_rate": 4.702470709203435e-07, + "loss": 0.0047, + "num_tokens": 1576075836.0, + "reward": 2.4620537757873535, + "reward_std": 0.44590461254119873, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1523328423500061, + "step": 2822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 967.9598388671875, + "completions/mean_terminated_length": 722.3616333007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6015662458046988, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13182988550173652, + "kl": 0.029541015625, + "learning_rate": 4.699176264980569e-07, + "loss": 0.0847, + "num_tokens": 1576580346.0, + "reward": 2.4419643878936768, + "reward_std": 0.4264742434024811, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14530304074287415, + "step": 2823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1070.727783203125, + "completions/mean_terminated_length": 889.7512817382812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6017793404720047, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1202016024418567, + "kl": 0.02484130859375, + "learning_rate": 4.695882263952341e-07, + "loss": 0.0275, + "num_tokens": 1577135360.0, + "reward": 2.408482313156128, + "reward_std": 0.3730555474758148, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09768717736005783, + "step": 2824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 817.4910888671875, + "completions/mean_terminated_length": 686.844482421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6019924351393107, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15864738773701906, + "kl": 0.035888671875, + "learning_rate": 4.6925887079417305e-07, + "loss": 0.0994, + "num_tokens": 1577560972.0, + "reward": 2.646763563156128, + "reward_std": 0.42769160866737366, + "rewards/accuracy_reward/mean": 0.7321428656578064, + "rewards/accuracy_reward/std": 0.4433377683162689, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14851415157318115, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 1127.1585693359375, + "completions/mean_terminated_length": 872.680908203125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.6022055298066166, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1208160039068748, + "kl": 0.02618408203125, + "learning_rate": 4.6892955987714676e-07, + "loss": 0.0887, + "num_tokens": 1578139043.0, + "reward": 2.21875, + "reward_std": 0.46884554624557495, + "rewards/accuracy_reward/mean": 0.3392857015132904, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.15883232653141022, + "step": 2826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 880.4129638671875, + "completions/mean_terminated_length": 727.0934448242188, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6024186244739226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1362210172183119, + "kl": 0.0322265625, + "learning_rate": 4.686002938264038e-07, + "loss": 0.0617, + "num_tokens": 1578600956.0, + "reward": 2.576451063156128, + "reward_std": 0.4683157503604889, + "rewards/accuracy_reward/mean": 0.6941964030265808, + "rewards/accuracy_reward/std": 0.4612620174884796, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.12749071419239044, + "step": 2827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 883.0000610351562, + "completions/mean_terminated_length": 736.6431884765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6026317191412285, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1524470114260447, + "kl": 0.031646728515625, + "learning_rate": 4.682710728241673e-07, + "loss": 0.0678, + "num_tokens": 1579060588.0, + "reward": 2.46875, + "reward_std": 0.4039042592048645, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12842853367328644, + "step": 2828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 832.5022583007812, + "completions/mean_terminated_length": 703.4494018554688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6028448138085345, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14810856417970533, + "kl": 0.03253173828125, + "learning_rate": 4.679418970526364e-07, + "loss": 0.0833, + "num_tokens": 1579500605.0, + "reward": 2.400669813156128, + "reward_std": 0.42462319135665894, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13189572095870972, + "step": 2829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1026.5, + "completions/mean_terminated_length": 787.3057861328125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6030579084758404, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11555062285982094, + "kl": 0.026397705078125, + "learning_rate": 4.6761276669398465e-07, + "loss": 0.0185, + "num_tokens": 1580030573.0, + "reward": 2.423549175262451, + "reward_std": 0.3821007013320923, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11214316636323929, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 921.07373046875, + "completions/mean_terminated_length": 729.81982421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6032710031431463, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.13019577256502068, + "kl": 0.029510498046875, + "learning_rate": 4.672836819303599e-07, + "loss": 0.0336, + "num_tokens": 1580518030.0, + "reward": 2.463169813156128, + "reward_std": 0.37460607290267944, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.14405618607997894, + "step": 2831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1045.169677734375, + "completions/mean_terminated_length": 827.1630859375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.6034840978104523, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12102380319822655, + "kl": 0.028289794921875, + "learning_rate": 4.669546429438862e-07, + "loss": 0.0772, + "num_tokens": 1581056490.0, + "reward": 2.342076063156128, + "reward_std": 0.45620623230934143, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15941192209720612, + "step": 2832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 744.4910888671875, + "completions/mean_terminated_length": 569.5899047851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6036971924777582, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1344276443354919, + "kl": 0.03466796875, + "learning_rate": 4.6662564991666063e-07, + "loss": 0.0498, + "num_tokens": 1581453014.0, + "reward": 2.6060268878936768, + "reward_std": 0.33785903453826904, + "rewards/accuracy_reward/mean": 0.6808035969734192, + "rewards/accuracy_reward/std": 0.4666863977909088, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11389532685279846, + "step": 2833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 942.2902221679688, + "completions/mean_terminated_length": 698.2506713867188, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.6039102871450642, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14736744353647038, + "kl": 0.0316162109375, + "learning_rate": 4.662967030307565e-07, + "loss": 0.0617, + "num_tokens": 1581937944.0, + "reward": 2.4614956378936768, + "reward_std": 0.4201875925064087, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12015076726675034, + "step": 2834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 922.1428833007812, + "completions/mean_terminated_length": 758.0153198242188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6041233818123701, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.20228246371415648, + "kl": 0.02874755859375, + "learning_rate": 4.6596780246822023e-07, + "loss": 0.0556, + "num_tokens": 1582418504.0, + "reward": 2.4693081378936768, + "reward_std": 0.4557398557662964, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12848563492298126, + "step": 2835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1049.357177734375, + "completions/mean_terminated_length": 838.8324584960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6043364764796761, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.7432683201551241, + "kl": 0.04852294921875, + "learning_rate": 4.656389484110729e-07, + "loss": 0.0916, + "num_tokens": 1582955544.0, + "reward": 2.4302456378936768, + "reward_std": 0.47149714827537537, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13254131376743317, + "step": 2836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 995.216552734375, + "completions/mean_terminated_length": 759.3469848632812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.604549571146982, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13523759984680744, + "kl": 0.027313232421875, + "learning_rate": 4.653101410413106e-07, + "loss": 0.0373, + "num_tokens": 1583469561.0, + "reward": 2.3660714626312256, + "reward_std": 0.42035916447639465, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.16285285353660583, + "step": 2837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 964.8125610351562, + "completions/mean_terminated_length": 760.8169555664062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.604762665814288, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1.5521666735325894, + "kl": 0.0693359375, + "learning_rate": 4.6498138054090254e-07, + "loss": 0.0897, + "num_tokens": 1583975621.0, + "reward": 2.4330358505249023, + "reward_std": 0.4382622241973877, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13583585619926453, + "step": 2838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 964.8058471679688, + "completions/mean_terminated_length": 780.973876953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.604975760481594, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13942987974062662, + "kl": 0.0311279296875, + "learning_rate": 4.6465266709179297e-07, + "loss": 0.129, + "num_tokens": 1584473774.0, + "reward": 2.4575893878936768, + "reward_std": 0.4858267605304718, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.15674357116222382, + "step": 2839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 950.46435546875, + "completions/mean_terminated_length": 700.8876953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6051888551488999, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13767875116422879, + "kl": 0.028045654296875, + "learning_rate": 4.6432400087589916e-07, + "loss": 0.064, + "num_tokens": 1584968302.0, + "reward": 2.4174108505249023, + "reward_std": 0.38686221837997437, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12537893652915955, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 975.8594360351562, + "completions/mean_terminated_length": 777.3148193359375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6054019498162059, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14396323007202966, + "kl": 0.028289794921875, + "learning_rate": 4.6399538207511314e-07, + "loss": 0.0719, + "num_tokens": 1585478511.0, + "reward": 2.4693081378936768, + "reward_std": 0.4607437551021576, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12265980988740921, + "step": 2841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 873.1897583007812, + "completions/mean_terminated_length": 728.914794921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6056150444835118, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13316904134047614, + "kl": 0.030303955078125, + "learning_rate": 4.636668108713001e-07, + "loss": 0.0513, + "num_tokens": 1585937620.0, + "reward": 2.5825893878936768, + "reward_std": 0.4196329712867737, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.4769369065761566, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09178353101015091, + "step": 2842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 986.6808471679688, + "completions/mean_terminated_length": 769.8521728515625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6058281391508178, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13615299827012922, + "kl": 0.0284423828125, + "learning_rate": 4.6333828744629864e-07, + "loss": 0.0606, + "num_tokens": 1586442117.0, + "reward": 2.4012277126312256, + "reward_std": 0.42455416917800903, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14223673939704895, + "step": 2843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 967.966552734375, + "completions/mean_terminated_length": 696.44970703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6060412338181237, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13409888908545659, + "kl": 0.029693603515625, + "learning_rate": 4.63009811981922e-07, + "loss": 0.0666, + "num_tokens": 1586945126.0, + "reward": 2.428013563156128, + "reward_std": 0.40444400906562805, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13311463594436646, + "step": 2844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1013.9688110351562, + "completions/mean_terminated_length": 828.931640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6062543284854297, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1336980293682955, + "kl": 0.0277099609375, + "learning_rate": 4.6268138465995577e-07, + "loss": 0.0952, + "num_tokens": 1587465000.0, + "reward": 2.400111675262451, + "reward_std": 0.5023146867752075, + "rewards/accuracy_reward/mean": 0.5370370149612427, + "rewards/accuracy_reward/std": 0.49920448660850525, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.16255265474319458, + "step": 2845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1053.9554443359375, + "completions/mean_terminated_length": 804.0558471679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6064674231527356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13610729634938068, + "kl": 0.02557373046875, + "learning_rate": 4.623530056621595e-07, + "loss": 0.1131, + "num_tokens": 1588008228.0, + "reward": 2.3705358505249023, + "reward_std": 0.5395359992980957, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15850186347961426, + "step": 2846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1048.05810546875, + "completions/mean_terminated_length": 782.5367431640625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6066805178200415, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12788533250369888, + "kl": 0.02960205078125, + "learning_rate": 4.6202467517026577e-07, + "loss": 0.0909, + "num_tokens": 1588540974.0, + "reward": 2.4068081378936768, + "reward_std": 0.4975346028804779, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.1746300309896469, + "step": 2847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 950.357177734375, + "completions/mean_terminated_length": 790.3427124023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6068936124873475, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13990166966270953, + "kl": 0.027801513671875, + "learning_rate": 4.6169639336598044e-07, + "loss": 0.0545, + "num_tokens": 1589033070.0, + "reward": 2.5122768878936768, + "reward_std": 0.4686611592769623, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13503853976726532, + "step": 2848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 986.1607666015625, + "completions/mean_terminated_length": 779.4559936523438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6071067071546534, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1353569825824958, + "kl": 0.029083251953125, + "learning_rate": 4.613681604309824e-07, + "loss": 0.0398, + "num_tokens": 1589540198.0, + "reward": 2.3515625, + "reward_std": 0.35539567470550537, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12016765773296356, + "step": 2849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1110.38623046875, + "completions/mean_terminated_length": 816.1788940429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6073198018219594, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15425632903868391, + "kl": 0.026641845703125, + "learning_rate": 4.6103997654692306e-07, + "loss": 0.0909, + "num_tokens": 1590106051.0, + "reward": 2.3387277126312256, + "reward_std": 0.44057029485702515, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316954612732, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14661912620067596, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 949.7433471679688, + "completions/mean_terminated_length": 805.5277709960938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6075328964892653, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12497472974267625, + "kl": 0.02825927734375, + "learning_rate": 4.6071184189542776e-07, + "loss": 0.0298, + "num_tokens": 1590596640.0, + "reward": 2.4676339626312256, + "reward_std": 0.3878638446331024, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9854910969734192, + "rewards/tag_count_reward/std": 0.08062524348497391, + "step": 2851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 754.4017944335938, + "completions/mean_terminated_length": 651.537353515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6077459911565714, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.16273375666563764, + "kl": 0.035888671875, + "learning_rate": 4.6038375665809337e-07, + "loss": 0.0983, + "num_tokens": 1590993412.0, + "reward": 2.62109375, + "reward_std": 0.3242541253566742, + "rewards/accuracy_reward/mean": 0.6941964030265808, + "rewards/accuracy_reward/std": 0.4612620174884796, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11067524552345276, + "step": 2852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 933.0357666015625, + "completions/mean_terminated_length": 753.9481811523438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6079590858238773, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12126863403179937, + "kl": 0.028228759765625, + "learning_rate": 4.6005572101649003e-07, + "loss": 0.0179, + "num_tokens": 1591480628.0, + "reward": 2.49609375, + "reward_std": 0.35623809695243835, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10500273108482361, + "step": 2853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 982.7500610351562, + "completions/mean_terminated_length": 811.6476440429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6081721804911832, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1373511993335161, + "kl": 0.030120849609375, + "learning_rate": 4.5972773515216067e-07, + "loss": 0.0583, + "num_tokens": 1591987828.0, + "reward": 2.4464287757873535, + "reward_std": 0.5139682292938232, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.13888955116271973, + "step": 2854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 949.8973388671875, + "completions/mean_terminated_length": 789.8158569335938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6083852751584892, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11349545906335115, + "kl": 0.028594970703125, + "learning_rate": 4.5939979924662e-07, + "loss": 0.0426, + "num_tokens": 1592484502.0, + "reward": 2.47265625, + "reward_std": 0.3707757294178009, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1276179403066635, + "step": 2855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 973.2098388671875, + "completions/mean_terminated_length": 825.903564453125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.6085983698257951, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.129508776677952, + "kl": 0.029876708984375, + "learning_rate": 4.5907191348135564e-07, + "loss": 0.0723, + "num_tokens": 1592990564.0, + "reward": 2.4285714626312256, + "reward_std": 0.4103049635887146, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10110349208116531, + "step": 2856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 864.2142944335938, + "completions/mean_terminated_length": 715.4974975585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6088114644931011, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14405055101098838, + "kl": 0.032867431640625, + "learning_rate": 4.5874407803782713e-07, + "loss": 0.0232, + "num_tokens": 1593444628.0, + "reward": 2.5385046005249023, + "reward_std": 0.4429364502429962, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10190140455961227, + "step": 2857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1049.712158203125, + "completions/mean_terminated_length": 874.1600952148438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.609024559160407, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13296464399506383, + "kl": 0.025848388671875, + "learning_rate": 4.5841629309746654e-07, + "loss": 0.0612, + "num_tokens": 1593982595.0, + "reward": 2.4620537757873535, + "reward_std": 0.48604339361190796, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14955390989780426, + "step": 2858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1066.8192138671875, + "completions/mean_terminated_length": 820.1536254882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.609237653827713, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12781966337556225, + "kl": 0.025177001953125, + "learning_rate": 4.5808855884167764e-07, + "loss": 0.1013, + "num_tokens": 1594529522.0, + "reward": 2.443638563156128, + "reward_std": 0.466962993144989, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15488377213478088, + "step": 2859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 942.2098388671875, + "completions/mean_terminated_length": 754.5430908203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6094507484950189, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12343636309814414, + "kl": 0.029388427734375, + "learning_rate": 4.5776087545183595e-07, + "loss": 0.066, + "num_tokens": 1595017664.0, + "reward": 2.560826063156128, + "reward_std": 0.36852946877479553, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.13000212609767914, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 893.5558471679688, + "completions/mean_terminated_length": 704.646728515625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6096638431623249, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13040082121280464, + "kl": 0.030975341796875, + "learning_rate": 4.5743324310928954e-07, + "loss": 0.0893, + "num_tokens": 1595479465.0, + "reward": 2.4799108505249023, + "reward_std": 0.3984030783176422, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.1249600425362587, + "step": 2861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 970.6741333007812, + "completions/mean_terminated_length": 781.2230834960938, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.6098769378296308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13817923490354794, + "kl": 0.031524658203125, + "learning_rate": 4.571056619953577e-07, + "loss": 0.1226, + "num_tokens": 1595988983.0, + "reward": 2.607701063156128, + "reward_std": 0.5265092849731445, + "rewards/accuracy_reward/mean": 0.7455357313156128, + "rewards/accuracy_reward/std": 0.4360465407371521, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16232208907604218, + "step": 2862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1057.265625, + "completions/mean_terminated_length": 825.2755126953125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6100900324969367, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1273777751826026, + "kl": 0.027801513671875, + "learning_rate": 4.5677813229133147e-07, + "loss": 0.0841, + "num_tokens": 1596527566.0, + "reward": 2.4213171005249023, + "reward_std": 0.46274977922439575, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14706972241401672, + "step": 2863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 964.8192138671875, + "completions/mean_terminated_length": 732.918701171875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6103031271642427, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.7698109684279082, + "kl": 0.035400390625, + "learning_rate": 4.564506541784734e-07, + "loss": 0.0905, + "num_tokens": 1597032285.0, + "reward": 2.369419813156128, + "reward_std": 0.46132078766822815, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.20024341344833374, + "step": 2864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1034.2098388671875, + "completions/mean_terminated_length": 813.8206787109375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.6105162218315486, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1295568786814832, + "kl": 0.026336669921875, + "learning_rate": 4.561232278380177e-07, + "loss": 0.0698, + "num_tokens": 1597562315.0, + "reward": 2.3828125, + "reward_std": 0.43705669045448303, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1549774408340454, + "step": 2865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1052.982177734375, + "completions/mean_terminated_length": 839.9566650390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6107293164988546, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11780500883098978, + "kl": 0.025787353515625, + "learning_rate": 4.557958534511699e-07, + "loss": 0.0589, + "num_tokens": 1598099555.0, + "reward": 2.385044813156128, + "reward_std": 0.390415221452713, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.15348808467388153, + "step": 2866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1083.1585693359375, + "completions/mean_terminated_length": 823.4985961914062, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.6109424111661605, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10853455398672648, + "kl": 0.025146484375, + "learning_rate": 4.554685311991062e-07, + "loss": 0.0302, + "num_tokens": 1598664218.0, + "reward": 2.3604912757873535, + "reward_std": 0.39786389470100403, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14550481736660004, + "step": 2867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 974.57373046875, + "completions/mean_terminated_length": 755.271484375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.6111555058334666, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13763119087535428, + "kl": 0.030059814453125, + "learning_rate": 4.5514126126297504e-07, + "loss": 0.0783, + "num_tokens": 1599170971.0, + "reward": 2.416294813156128, + "reward_std": 0.4127218425273895, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14262787997722626, + "step": 2868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 919.466552734375, + "completions/mean_terminated_length": 751.6333618164062, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6113686005007725, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13196013204500898, + "kl": 0.031158447265625, + "learning_rate": 4.5481404382389464e-07, + "loss": 0.0799, + "num_tokens": 1599647756.0, + "reward": 2.556919813156128, + "reward_std": 0.350829541683197, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13351379334926605, + "step": 2869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 844.7611694335938, + "completions/mean_terminated_length": 672.869873046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6115816951680784, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12061057348989138, + "kl": 0.030731201171875, + "learning_rate": 4.5448687906295535e-07, + "loss": 0.0062, + "num_tokens": 1600092641.0, + "reward": 2.5184152126312256, + "reward_std": 0.33801618218421936, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.11721604317426682, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1083.790283203125, + "completions/mean_terminated_length": 841.3910522460938, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6117947898353844, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12397367537784326, + "kl": 0.02496337890625, + "learning_rate": 4.541597671612176e-07, + "loss": 0.0661, + "num_tokens": 1600651347.0, + "reward": 2.3314733505249023, + "reward_std": 0.4692508280277252, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.17012260854244232, + "step": 2871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1012.37060546875, + "completions/mean_terminated_length": 769.8677978515625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6120078845026903, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15030125635043512, + "kl": 0.026519775390625, + "learning_rate": 4.5383270829971267e-07, + "loss": 0.1281, + "num_tokens": 1601180393.0, + "reward": 2.3911831378936768, + "reward_std": 0.42565271258354187, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14732414484024048, + "step": 2872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1040.94873046875, + "completions/mean_terminated_length": 787.779296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6122209791699963, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13458235905205101, + "kl": 0.026153564453125, + "learning_rate": 4.5350570265944264e-07, + "loss": 0.109, + "num_tokens": 1601709794.0, + "reward": 2.4386162757873535, + "reward_std": 0.45369967818260193, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14601868391036987, + "step": 2873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1053.5491943359375, + "completions/mean_terminated_length": 810.4611206054688, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6124340738373022, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12401696865521841, + "kl": 0.027099609375, + "learning_rate": 4.531787504213803e-07, + "loss": 0.0801, + "num_tokens": 1602244168.0, + "reward": 2.404017925262451, + "reward_std": 0.42932647466659546, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13636787235736847, + "step": 2874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1108.763427734375, + "completions/mean_terminated_length": 795.6845092773438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6126471685046082, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12581406937974943, + "kl": 0.023773193359375, + "learning_rate": 4.5285185176646855e-07, + "loss": 0.0913, + "num_tokens": 1602813118.0, + "reward": 2.4330358505249023, + "reward_std": 0.34430232644081116, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11611521244049072, + "step": 2875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1037.622802734375, + "completions/mean_terminated_length": 797.5884399414062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6128602631719141, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14057259627528182, + "kl": 0.02569580078125, + "learning_rate": 4.5252500687562087e-07, + "loss": 0.0648, + "num_tokens": 1603351397.0, + "reward": 2.4419643878936768, + "reward_std": 0.5148780345916748, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14955389499664307, + "step": 2876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1065.450927734375, + "completions/mean_terminated_length": 821.8662719726562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6130733578392201, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11995240492235462, + "kl": 0.02435302734375, + "learning_rate": 4.5219821592972075e-07, + "loss": 0.0708, + "num_tokens": 1603898607.0, + "reward": 2.3465402126312256, + "reward_std": 0.4355149567127228, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14732414484024048, + "step": 2877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 869.029052734375, + "completions/mean_terminated_length": 704.0330810546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.613286452506526, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14101268544745005, + "kl": 0.03240966796875, + "learning_rate": 4.518714791096221e-07, + "loss": 0.0815, + "num_tokens": 1604359116.0, + "reward": 2.509486675262451, + "reward_std": 0.4213913381099701, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.12909629940986633, + "step": 2878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 937.4219360351562, + "completions/mean_terminated_length": 735.2322387695312, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.6134995471738319, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1295449103127536, + "kl": 0.030303955078125, + "learning_rate": 4.515447965961484e-07, + "loss": 0.049, + "num_tokens": 1604841769.0, + "reward": 2.545201063156128, + "reward_std": 0.4607691168785095, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14921021461486816, + "step": 2879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1097.6004638671875, + "completions/mean_terminated_length": 841.8272094726562, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.6137126418411379, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1224310065558852, + "kl": 0.023651123046875, + "learning_rate": 4.512181685700939e-07, + "loss": 0.0618, + "num_tokens": 1605399158.0, + "reward": 2.4095983505249023, + "reward_std": 0.4590371549129486, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14409084618091583, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1104.4598388671875, + "completions/mean_terminated_length": 789.9464721679688, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6139257365084438, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12425075919754522, + "kl": 0.02557373046875, + "learning_rate": 4.508915952122219e-07, + "loss": 0.0908, + "num_tokens": 1605973428.0, + "reward": 2.279576063156128, + "reward_std": 0.4760548174381256, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16850323975086212, + "step": 2881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1041.6763916015625, + "completions/mean_terminated_length": 799.1550903320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6141388311757499, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13368570307897557, + "kl": 0.02606201171875, + "learning_rate": 4.505650767032659e-07, + "loss": 0.0843, + "num_tokens": 1606514899.0, + "reward": 2.2572546005249023, + "reward_std": 0.4593934416770935, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.18073564767837524, + "step": 2882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1029.5960693359375, + "completions/mean_terminated_length": 856.7598266601562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6143519258430558, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12265985993196467, + "kl": 0.024810791015625, + "learning_rate": 4.50238613223929e-07, + "loss": 0.0847, + "num_tokens": 1607045294.0, + "reward": 2.416294813156128, + "reward_std": 0.40520042181015015, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1613858938217163, + "step": 2883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1062.779052734375, + "completions/mean_terminated_length": 779.6695556640625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.6145650205103618, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12744129719216457, + "kl": 0.0283203125, + "learning_rate": 4.4991220495488325e-07, + "loss": 0.1393, + "num_tokens": 1607588331.0, + "reward": 2.462611675262451, + "reward_std": 0.4095012843608856, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12151455134153366, + "step": 2884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 915.7232666015625, + "completions/mean_terminated_length": 654.4285888671875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6147781151776677, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14409158958824078, + "kl": 0.029388427734375, + "learning_rate": 4.4958585207677134e-07, + "loss": 0.0496, + "num_tokens": 1608071743.0, + "reward": 2.420201063156128, + "reward_std": 0.4292536675930023, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14003422856330872, + "step": 2885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 904.0000610351562, + "completions/mean_terminated_length": 716.7999877929688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6149912098449736, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.140985485151728, + "kl": 0.027801513671875, + "learning_rate": 4.4925955477020395e-07, + "loss": 0.0663, + "num_tokens": 1608548703.0, + "reward": 2.5072546005249023, + "reward_std": 0.38039425015449524, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9849330186843872, + "rewards/tag_count_reward/std": 0.10128699988126755, + "step": 2886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 918.6250610351562, + "completions/mean_terminated_length": 747.3316040039062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6152043045122796, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14104766200789357, + "kl": 0.030120849609375, + "learning_rate": 4.489333132157622e-07, + "loss": 0.0487, + "num_tokens": 1609034935.0, + "reward": 2.4486608505249023, + "reward_std": 0.4112870395183563, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11321921646595001, + "step": 2887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1141.888427734375, + "completions/mean_terminated_length": 832.6168212890625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6154173991795855, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11475268355449765, + "kl": 0.0240478515625, + "learning_rate": 4.4860712759399556e-07, + "loss": 0.0605, + "num_tokens": 1609617525.0, + "reward": 2.3236608505249023, + "reward_std": 0.4319365620613098, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12337145209312439, + "step": 2888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1013.37060546875, + "completions/mean_terminated_length": 723.6742553710938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6156304938468915, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12897899610763983, + "kl": 0.02655029296875, + "learning_rate": 4.4828099808542265e-07, + "loss": 0.069, + "num_tokens": 1610139707.0, + "reward": 2.365513563156128, + "reward_std": 0.3626191318035126, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12493880838155746, + "step": 2889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 898.0000610351562, + "completions/mean_terminated_length": 663.0537719726562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6158435885141974, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12901845046172786, + "kl": 0.029083251953125, + "learning_rate": 4.4795492487053155e-07, + "loss": 0.0567, + "num_tokens": 1610610299.0, + "reward": 2.5306921005249023, + "reward_std": 0.39017871022224426, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1208655834197998, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1005.7076416015625, + "completions/mean_terminated_length": 765.1785888671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6160566831815034, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.1103107014736137, + "kl": 0.026519775390625, + "learning_rate": 4.476289081297786e-07, + "loss": 0.0544, + "num_tokens": 1611135880.0, + "reward": 2.416294813156128, + "reward_std": 0.3600604236125946, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9877232313156128, + "rewards/tag_count_reward/std": 0.08601906895637512, + "step": 2891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 897.1763916015625, + "completions/mean_terminated_length": 759.0774536132812, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.6162697778488093, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12407218411372699, + "kl": 0.0294189453125, + "learning_rate": 4.4730294804358915e-07, + "loss": 0.0739, + "num_tokens": 1611602839.0, + "reward": 2.5831475257873535, + "reward_std": 0.3061332106590271, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10593781620264053, + "step": 2892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1172.0023193359375, + "completions/mean_terminated_length": 873.009033203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6164828725161153, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.16323903551672525, + "kl": 0.02557373046875, + "learning_rate": 4.469770447923572e-07, + "loss": 0.0817, + "num_tokens": 1612206968.0, + "reward": 2.2734375, + "reward_std": 0.37254172563552856, + "rewards/accuracy_reward/mean": 0.37731480598449707, + "rewards/accuracy_reward/std": 0.4852766990661621, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14553913474082947, + "step": 2893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 989.2232666015625, + "completions/mean_terminated_length": 796.4644165039062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6166959671834212, + "frac_reward_zero_std": 0.3214285969734192, + "grad_norm": 0.10645483356306196, + "kl": 0.027008056640625, + "learning_rate": 4.4665119855644527e-07, + "loss": 0.0709, + "num_tokens": 1612717356.0, + "reward": 2.463169813156128, + "reward_std": 0.3140815496444702, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174960792064667, + "rewards/tag_count_reward/mean": 0.9877232313156128, + "rewards/tag_count_reward/std": 0.08921077847480774, + "step": 2894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1075.3638916015625, + "completions/mean_terminated_length": 847.611572265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6169090618507271, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13148551061062969, + "kl": 0.025665283203125, + "learning_rate": 4.4632540951618423e-07, + "loss": 0.09, + "num_tokens": 1613270191.0, + "reward": 2.435826063156128, + "reward_std": 0.46753934025764465, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13802282512187958, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1079.325927734375, + "completions/mean_terminated_length": 825.560546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6171221565180331, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.126206586864275, + "kl": 0.023529052734375, + "learning_rate": 4.459996778518733e-07, + "loss": 0.104, + "num_tokens": 1613817217.0, + "reward": 2.40234375, + "reward_std": 0.49909600615501404, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15693362057209015, + "step": 2896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 994.9107666015625, + "completions/mean_terminated_length": 762.4849853515625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.617335251185339, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1357382529194367, + "kl": 0.02984619140625, + "learning_rate": 4.456740037437803e-07, + "loss": 0.0963, + "num_tokens": 1614321753.0, + "reward": 2.3705358505249023, + "reward_std": 0.47359156608581543, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14714714884757996, + "step": 2897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1042.009033203125, + "completions/mean_terminated_length": 806.4462890625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6175483458526451, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.391172870211858, + "kl": 0.025909423828125, + "learning_rate": 4.453483873721405e-07, + "loss": 0.0723, + "num_tokens": 1614864365.0, + "reward": 2.3482143878936768, + "reward_std": 0.44395479559898376, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.16540852189064026, + "step": 2898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1037.640625, + "completions/mean_terminated_length": 769.3530883789062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.617761440519951, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11583628232390675, + "kl": 0.024871826171875, + "learning_rate": 4.4502282891715816e-07, + "loss": 0.0532, + "num_tokens": 1615402316.0, + "reward": 2.3158483505249023, + "reward_std": 0.32073545455932617, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.191280335187912, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.11055814474821091, + "step": 2899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 979.7745971679688, + "completions/mean_terminated_length": 781.9550170898438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.617974535187257, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1448420653891271, + "kl": 0.02825927734375, + "learning_rate": 4.4469732855900463e-07, + "loss": 0.0811, + "num_tokens": 1615923559.0, + "reward": 2.3856027126312256, + "reward_std": 0.397357314825058, + "rewards/accuracy_reward/mean": 0.47453704476356506, + "rewards/accuracy_reward/std": 0.49993017315864563, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9860491156578064, + "rewards/tag_count_reward/std": 0.08974618464708328, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 927.1585083007812, + "completions/mean_terminated_length": 701.7882080078125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6181876298545629, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14395714968330442, + "kl": 0.02587890625, + "learning_rate": 4.443718864778194e-07, + "loss": 0.1535, + "num_tokens": 1616405566.0, + "reward": 2.5518975257873535, + "reward_std": 0.4102056324481964, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11440251022577286, + "step": 2901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 806.7455444335938, + "completions/mean_terminated_length": 657.7949829101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6184007245218689, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.1239173052614156, + "kl": 0.032806396484375, + "learning_rate": 4.4404650285371003e-07, + "loss": 0.0388, + "num_tokens": 1616834668.0, + "reward": 2.56640625, + "reward_std": 0.3254980742931366, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9905133843421936, + "rewards/tag_count_reward/std": 0.07129643112421036, + "step": 2902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1069.5379638671875, + "completions/mean_terminated_length": 791.97998046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6186138191891748, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1223114495098053, + "kl": 0.02630615234375, + "learning_rate": 4.43721177866751e-07, + "loss": 0.0892, + "num_tokens": 1617380045.0, + "reward": 2.400669813156128, + "reward_std": 0.3762684762477875, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.10537806153297424, + "step": 2903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 879.8170166015625, + "completions/mean_terminated_length": 712.9336547851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6188269138564807, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13887514062787995, + "kl": 0.0308837890625, + "learning_rate": 4.433959116969854e-07, + "loss": 0.0744, + "num_tokens": 1617837259.0, + "reward": 2.505580425262451, + "reward_std": 0.4158722460269928, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266100406646729, + "step": 2904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1029.696533203125, + "completions/mean_terminated_length": 844.3060913085938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6190400085237867, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1210792802205966, + "kl": 0.027587890625, + "learning_rate": 4.4307070452442263e-07, + "loss": 0.0554, + "num_tokens": 1618365251.0, + "reward": 2.4559152126312256, + "reward_std": 0.4400137662887573, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.1046096533536911, + "step": 2905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 943.3504638671875, + "completions/mean_terminated_length": 772.5283203125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6192531031910926, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12240365164399232, + "kl": 0.031097412109375, + "learning_rate": 4.4274555652904036e-07, + "loss": 0.0531, + "num_tokens": 1618850368.0, + "reward": 2.45703125, + "reward_std": 0.45271119475364685, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12153510004281998, + "step": 2906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 945.1160888671875, + "completions/mean_terminated_length": 784.3375854492188, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.6194661978583986, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14737879200148382, + "kl": 0.0301513671875, + "learning_rate": 4.424204678907828e-07, + "loss": 0.1093, + "num_tokens": 1619335700.0, + "reward": 2.4955358505249023, + "reward_std": 0.42307984828948975, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13165414333343506, + "step": 2907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 893.3817138671875, + "completions/mean_terminated_length": 754.8274536132812, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6196792925257045, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13789178621337891, + "kl": 0.03472900390625, + "learning_rate": 4.420954387895616e-07, + "loss": 0.094, + "num_tokens": 1619805007.0, + "reward": 2.4760046005249023, + "reward_std": 0.4313313066959381, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13598167896270752, + "step": 2908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 980.6094360351562, + "completions/mean_terminated_length": 730.66943359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6198923871930105, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12767785544852975, + "kl": 0.02801513671875, + "learning_rate": 4.4177046940525584e-07, + "loss": 0.0738, + "num_tokens": 1620309712.0, + "reward": 2.3833706378936768, + "reward_std": 0.402037650346756, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14469929039478302, + "step": 2909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 958.87060546875, + "completions/mean_terminated_length": 750.3137817382812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6201054818603164, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.130251396607149, + "kl": 0.026580810546875, + "learning_rate": 4.414455599177108e-07, + "loss": 0.1337, + "num_tokens": 1620806486.0, + "reward": 2.490513563156128, + "reward_std": 0.4547174870967865, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14661912620067596, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 891.1428833007812, + "completions/mean_terminated_length": 719.0974731445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6203185765276223, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13619977988975016, + "kl": 0.029693603515625, + "learning_rate": 4.411207105067395e-07, + "loss": 0.0908, + "num_tokens": 1621270758.0, + "reward": 2.5401787757873535, + "reward_std": 0.42019975185394287, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11286582797765732, + "step": 2911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 957.607177734375, + "completions/mean_terminated_length": 775.875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6205316711949284, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12758683090572753, + "kl": 0.02935791015625, + "learning_rate": 4.4079592135212086e-07, + "loss": 0.0415, + "num_tokens": 1621763206.0, + "reward": 2.447544813156128, + "reward_std": 0.3945702910423279, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.1151164099574089, + "step": 2912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1139.6451416015625, + "completions/mean_terminated_length": 840.4539794921875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6207447658622343, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11907224792520814, + "kl": 0.022247314453125, + "learning_rate": 4.4047119263360077e-07, + "loss": 0.0456, + "num_tokens": 1622350391.0, + "reward": 2.3560268878936768, + "reward_std": 0.3523109555244446, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11828288435935974, + "step": 2913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 928.0402221679688, + "completions/mean_terminated_length": 699.231201171875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.6209578605295403, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14770157836194658, + "kl": 0.03192138671875, + "learning_rate": 4.4014652453089185e-07, + "loss": 0.0807, + "num_tokens": 1622836025.0, + "reward": 2.4693081378936768, + "reward_std": 0.42143645882606506, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11682131141424179, + "step": 2914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 920.4397583007812, + "completions/mean_terminated_length": 778.7864379882812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6211709551968462, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14731422213787046, + "kl": 0.030609130859375, + "learning_rate": 4.39821917223673e-07, + "loss": 0.0643, + "num_tokens": 1623320030.0, + "reward": 2.4419643878936768, + "reward_std": 0.42775759100914, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11119430512189865, + "step": 2915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 793.6652221679688, + "completions/mean_terminated_length": 663.9063720703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6213840498641522, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.13074259016898343, + "kl": 0.03643798828125, + "learning_rate": 4.3949737089158977e-07, + "loss": 0.0513, + "num_tokens": 1623734520.0, + "reward": 2.5970983505249023, + "reward_std": 0.3793795704841614, + "rewards/accuracy_reward/mean": 0.6808035969734192, + "rewards/accuracy_reward/std": 0.4666863977909088, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12585100531578064, + "step": 2916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1048.977783203125, + "completions/mean_terminated_length": 835.0948486328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6215971445314581, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11005979204053319, + "kl": 0.024169921875, + "learning_rate": 4.3917288571425314e-07, + "loss": 0.1006, + "num_tokens": 1624274462.0, + "reward": 2.400669813156128, + "reward_std": 0.37523284554481506, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9709821343421936, + "rewards/format_reward/std": 0.16804419457912445, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.12218689918518066, + "step": 2917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 1060.68310546875, + "completions/mean_terminated_length": 791.414794921875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.6218102391987641, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1280998146725062, + "kl": 0.02642822265625, + "learning_rate": 4.388484618712415e-07, + "loss": 0.1334, + "num_tokens": 1624815872.0, + "reward": 2.3800225257873535, + "reward_std": 0.47886040806770325, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15730705857276917, + "step": 2918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 968.6027221679688, + "completions/mean_terminated_length": 782.1099853515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.62202333386607, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1176024413660939, + "kl": 0.027008056640625, + "learning_rate": 4.3852409954209836e-07, + "loss": 0.1117, + "num_tokens": 1625321726.0, + "reward": 2.4972100257873535, + "reward_std": 0.3666824400424957, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9860491156578064, + "rewards/tag_count_reward/std": 0.09129084646701813, + "step": 2919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 914.2500610351562, + "completions/mean_terminated_length": 745.6410522460938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6222364285333759, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14159414822526759, + "kl": 0.0291748046875, + "learning_rate": 4.38199798906333e-07, + "loss": 0.0798, + "num_tokens": 1625797454.0, + "reward": 2.5496652126312256, + "reward_std": 0.4418925344944, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.48100295662879944, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.15107274055480957, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 934.857177734375, + "completions/mean_terminated_length": 718.1653442382812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6224495232006819, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14736922970767907, + "kl": 0.026611328125, + "learning_rate": 4.378755601434216e-07, + "loss": 0.0952, + "num_tokens": 1626284302.0, + "reward": 2.454799175262451, + "reward_std": 0.37240859866142273, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12245608121156693, + "step": 2921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1090.13623046875, + "completions/mean_terminated_length": 862.577392578125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6226626178679878, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11715308222447135, + "kl": 0.025146484375, + "learning_rate": 4.375513834328052e-07, + "loss": 0.0684, + "num_tokens": 1626840107.0, + "reward": 2.3431921005249023, + "reward_std": 0.45546308159828186, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1372518688440323, + "step": 2922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 938.700927734375, + "completions/mean_terminated_length": 701.2086791992188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6228757125352938, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1302772180264295, + "kl": 0.026824951171875, + "learning_rate": 4.3722726895389097e-07, + "loss": 0.0633, + "num_tokens": 1627325557.0, + "reward": 2.4263393878936768, + "reward_std": 0.3694835603237152, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13422717154026031, + "step": 2923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 986.2813110351562, + "completions/mean_terminated_length": 748.4097900390625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.6230888072025997, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13089899546125877, + "kl": 0.02618408203125, + "learning_rate": 4.369032168860513e-07, + "loss": 0.1192, + "num_tokens": 1627844163.0, + "reward": 2.42578125, + "reward_std": 0.4136088490486145, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14372976124286652, + "step": 2924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 918.0379638671875, + "completions/mean_terminated_length": 687.1854858398438, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.6233019018699058, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13922063077601252, + "kl": 0.031402587890625, + "learning_rate": 4.3657922740862416e-07, + "loss": 0.1143, + "num_tokens": 1628320484.0, + "reward": 2.4888393878936768, + "reward_std": 0.3887077271938324, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14672234654426575, + "step": 2925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 924.0000610351562, + "completions/mean_terminated_length": 729.801025390625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6235149965372117, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11395968150981141, + "kl": 0.0279541015625, + "learning_rate": 4.362553007009131e-07, + "loss": 0.0392, + "num_tokens": 1628803156.0, + "reward": 2.50390625, + "reward_std": 0.33635610342025757, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.1038430780172348, + "step": 2926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 863.1004638671875, + "completions/mean_terminated_length": 693.8290405273438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6237280912045176, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15620490141293092, + "kl": 0.03082275390625, + "learning_rate": 4.359314369421866e-07, + "loss": 0.0986, + "num_tokens": 1629258449.0, + "reward": 2.3989956378936768, + "reward_std": 0.46517857909202576, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.16737332940101624, + "step": 2927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1020.904052734375, + "completions/mean_terminated_length": 814.3834228515625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6239411858718236, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13903351964370164, + "kl": 0.02783203125, + "learning_rate": 4.3560763631167876e-07, + "loss": 0.0926, + "num_tokens": 1629780918.0, + "reward": 2.4213171005249023, + "reward_std": 0.5255187153816223, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.18306908011436462, + "step": 2928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 947.1785888671875, + "completions/mean_terminated_length": 707.8695678710938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6241542805391295, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12844324080117073, + "kl": 0.02734375, + "learning_rate": 4.352838989885882e-07, + "loss": 0.0866, + "num_tokens": 1630269718.0, + "reward": 2.404017925262451, + "reward_std": 0.3857608139514923, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.18154285848140717, + "step": 2929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1104.671875, + "completions/mean_terminated_length": 837.0802612304688, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6243673752064355, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1.3006043144557853, + "kl": 0.075439453125, + "learning_rate": 4.349602251520786e-07, + "loss": 0.0535, + "num_tokens": 1630855827.0, + "reward": 2.34375, + "reward_std": 0.46391230821609497, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14088857173919678, + "step": 2930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1042.634033203125, + "completions/mean_terminated_length": 800.3434448242188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6245804698737414, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1403942875675658, + "kl": 0.02410888671875, + "learning_rate": 4.34636614981279e-07, + "loss": 0.0545, + "num_tokens": 1631397999.0, + "reward": 2.373326063156128, + "reward_std": 0.39641252160072327, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9642857313156128, + "rewards/format_reward/std": 0.18578432500362396, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14590215682983398, + "step": 2931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 847.5848388671875, + "completions/mean_terminated_length": 703.5349731445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6247935645410474, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1352385194778094, + "kl": 0.031646728515625, + "learning_rate": 4.343130686552826e-07, + "loss": 0.0595, + "num_tokens": 1631850549.0, + "reward": 2.4972100257873535, + "reward_std": 0.4342727065086365, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1397307962179184, + "step": 2932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 923.5938110351562, + "completions/mean_terminated_length": 742.9896240234375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6250066592083533, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.20393504962754053, + "kl": 0.027984619140625, + "learning_rate": 4.3398958635314764e-07, + "loss": 0.0791, + "num_tokens": 1632326399.0, + "reward": 2.4146206378936768, + "reward_std": 0.43712449073791504, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13572438061237335, + "step": 2933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 985.935302734375, + "completions/mean_terminated_length": 775.7941284179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6252197538756593, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12234409285833742, + "kl": 0.028289794921875, + "learning_rate": 4.3366616825389666e-07, + "loss": 0.0734, + "num_tokens": 1632841490.0, + "reward": 2.390625, + "reward_std": 0.38307416439056396, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12951265275478363, + "step": 2934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 959.8594360351562, + "completions/mean_terminated_length": 775.18798828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6254328485429652, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12935150974258922, + "kl": 0.02691650390625, + "learning_rate": 4.333428145365172e-07, + "loss": 0.0629, + "num_tokens": 1633338643.0, + "reward": 2.4347100257873535, + "reward_std": 0.41431596875190735, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11780035495758057, + "step": 2935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1006.685302734375, + "completions/mean_terminated_length": 783.7479858398438, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.6256459432102711, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1317130484853896, + "kl": 0.02923583984375, + "learning_rate": 4.3301952537996047e-07, + "loss": 0.0644, + "num_tokens": 1633856998.0, + "reward": 2.3861608505249023, + "reward_std": 0.4497280418872833, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15456201136112213, + "step": 2936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1026.2388916015625, + "completions/mean_terminated_length": 793.8931884765625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6258590378775771, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.134650522901962, + "kl": 0.026824951171875, + "learning_rate": 4.3269630096314224e-07, + "loss": 0.083, + "num_tokens": 1634391601.0, + "reward": 2.4425225257873535, + "reward_std": 0.3779553472995758, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10811902582645416, + "step": 2937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 889.919677734375, + "completions/mean_terminated_length": 727.8473510742188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.626072132544883, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13141837685870725, + "kl": 0.032928466796875, + "learning_rate": 4.3237314146494275e-07, + "loss": 0.0529, + "num_tokens": 1634853597.0, + "reward": 2.5630581378936768, + "reward_std": 0.40721070766448975, + "rewards/accuracy_reward/mean": 0.6651785969734192, + "rewards/accuracy_reward/std": 0.47245556116104126, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13017487525939941, + "step": 2938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1092.513427734375, + "completions/mean_terminated_length": 842.2027587890625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.626285227212189, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13281598774204922, + "kl": 0.02484130859375, + "learning_rate": 4.3205004706420565e-07, + "loss": 0.0761, + "num_tokens": 1635413091.0, + "reward": 2.338169813156128, + "reward_std": 0.4590991735458374, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.17070867121219635, + "step": 2939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 971.0313110351562, + "completions/mean_terminated_length": 804.4896850585938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.626498321879495, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13274640130248275, + "kl": 0.029815673828125, + "learning_rate": 4.3172701793973953e-07, + "loss": 0.0403, + "num_tokens": 1635913153.0, + "reward": 2.439174175262451, + "reward_std": 0.4147360920906067, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13826683163642883, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1086.9442138671875, + "completions/mean_terminated_length": 828.3031005859375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.626711416546801, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13791095620338129, + "kl": 0.023162841796875, + "learning_rate": 4.314040542703158e-07, + "loss": 0.0393, + "num_tokens": 1636474296.0, + "reward": 2.3309152126312256, + "reward_std": 0.40279242396354675, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11757759749889374, + "step": 2941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1017.638427734375, + "completions/mean_terminated_length": 776.369140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6269245112141069, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13334995685296192, + "kl": 0.027252197265625, + "learning_rate": 4.3108115623467024e-07, + "loss": 0.0646, + "num_tokens": 1637002470.0, + "reward": 2.4347100257873535, + "reward_std": 0.4832659959793091, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14372976124286652, + "step": 2942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 895.325927734375, + "completions/mean_terminated_length": 720.4987182617188, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6271376058814128, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14117247860426582, + "kl": 0.03204345703125, + "learning_rate": 4.3075832401150237e-07, + "loss": 0.0614, + "num_tokens": 1637471048.0, + "reward": 2.497767925262451, + "reward_std": 0.45960891246795654, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835427761078, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.17274779081344604, + "step": 2943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 914.26123046875, + "completions/mean_terminated_length": 742.305908203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6273507005487188, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.16033518372201397, + "kl": 0.031585693359375, + "learning_rate": 4.3043555777947483e-07, + "loss": 0.0951, + "num_tokens": 1637953709.0, + "reward": 2.446986675262451, + "reward_std": 0.4910048246383667, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15764795243740082, + "step": 2944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1005.7500610351562, + "completions/mean_terminated_length": 812.74072265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6275637952160247, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.123985975829371, + "kl": 0.02734375, + "learning_rate": 4.3011285771721416e-07, + "loss": 0.0652, + "num_tokens": 1638475853.0, + "reward": 2.474330425262451, + "reward_std": 0.42471638321876526, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12473505735397339, + "step": 2945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 987.5535888671875, + "completions/mean_terminated_length": 770.9031982421875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6277768898833307, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13642001030688675, + "kl": 0.029815673828125, + "learning_rate": 4.297902240033102e-07, + "loss": 0.1084, + "num_tokens": 1638985141.0, + "reward": 2.3487725257873535, + "reward_std": 0.3983851969242096, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13802284002304077, + "step": 2946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1026.44873046875, + "completions/mean_terminated_length": 776.7361450195312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6279899845506366, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15240731021591938, + "kl": 0.03106689453125, + "learning_rate": 4.2946765681631605e-07, + "loss": 0.0767, + "num_tokens": 1639512750.0, + "reward": 2.3716518878936768, + "reward_std": 0.5022507309913635, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1642991006374359, + "step": 2947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 978.9442138671875, + "completions/mean_terminated_length": 835.5012817382812, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6282030792179426, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14541485338549148, + "kl": 0.027587890625, + "learning_rate": 4.29145156334748e-07, + "loss": 0.0962, + "num_tokens": 1640015669.0, + "reward": 2.415736675262451, + "reward_std": 0.42475083470344543, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13598167896270752, + "step": 2948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1021.0870971679688, + "completions/mean_terminated_length": 814.603271484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6284161738852485, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11826521985297453, + "kl": 0.025604248046875, + "learning_rate": 4.288227227370851e-07, + "loss": 0.086, + "num_tokens": 1640550332.0, + "reward": 2.4090402126312256, + "reward_std": 0.4001913368701935, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13700605928897858, + "step": 2949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 958.6964721679688, + "completions/mean_terminated_length": 777.1458740234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6286292685525545, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13759595384798867, + "kl": 0.02801513671875, + "learning_rate": 4.2850035620176994e-07, + "loss": 0.0695, + "num_tokens": 1641048196.0, + "reward": 2.4341518878936768, + "reward_std": 0.484378457069397, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16713166236877441, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1076.5826416015625, + "completions/mean_terminated_length": 808.128173828125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6288423632198604, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10844120930873578, + "kl": 0.022369384765625, + "learning_rate": 4.2817805690720744e-07, + "loss": 0.0528, + "num_tokens": 1641601257.0, + "reward": 2.3995537757873535, + "reward_std": 0.3386138677597046, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9665178656578064, + "rewards/format_reward/std": 0.18009299039840698, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10110348463058472, + "step": 2951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 797.3660888671875, + "completions/mean_terminated_length": 694.656982421875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6290554578871663, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12832393414898433, + "kl": 0.0322265625, + "learning_rate": 4.27855825031766e-07, + "loss": 0.0788, + "num_tokens": 1642022173.0, + "reward": 2.6060268878936768, + "reward_std": 0.3501691222190857, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11389531940221786, + "step": 2952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 998.5067138671875, + "completions/mean_terminated_length": 800.8567504882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6292685525544723, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13658840187630403, + "kl": 0.026214599609375, + "learning_rate": 4.2753366075377595e-07, + "loss": 0.0842, + "num_tokens": 1642540400.0, + "reward": 2.5066964626312256, + "reward_std": 0.4364209473133087, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.14753690361976624, + "step": 2953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1129.571533203125, + "completions/mean_terminated_length": 920.7233276367188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6294816472217782, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12549913842168856, + "kl": 0.023651123046875, + "learning_rate": 4.272115642515305e-07, + "loss": 0.0533, + "num_tokens": 1643121152.0, + "reward": 2.3080358505249023, + "reward_std": 0.46695148944854736, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.17342577874660492, + "step": 2954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 903.0178833007812, + "completions/mean_terminated_length": 775.166259765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6296947418890843, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13263076842967625, + "kl": 0.027984619140625, + "learning_rate": 4.26889535703286e-07, + "loss": 0.0264, + "num_tokens": 1643590888.0, + "reward": 2.521205425262451, + "reward_std": 0.3816625773906708, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09310024231672287, + "step": 2955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1034.3125, + "completions/mean_terminated_length": 807.2021484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6299078365563902, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1276400854732765, + "kl": 0.025390625, + "learning_rate": 4.265675752872597e-07, + "loss": 0.0679, + "num_tokens": 1644123220.0, + "reward": 2.4112725257873535, + "reward_std": 0.4529055953025818, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.14635494351387024, + "step": 2956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 988.8817138671875, + "completions/mean_terminated_length": 796.0607299804688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6301209312236962, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13500087789665793, + "kl": 0.02618408203125, + "learning_rate": 4.262456831816329e-07, + "loss": 0.0767, + "num_tokens": 1644642815.0, + "reward": 2.3136162757873535, + "reward_std": 0.4059527516365051, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.10032878816127777, + "step": 2957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 872.4285888671875, + "completions/mean_terminated_length": 741.1612548828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6303340258910021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14888490151885184, + "kl": 0.0325927734375, + "learning_rate": 4.259238595645476e-07, + "loss": 0.0619, + "num_tokens": 1645098143.0, + "reward": 2.517299175262451, + "reward_std": 0.4469611644744873, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13206008076667786, + "step": 2958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1018.5491333007812, + "completions/mean_terminated_length": 808.231201171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6305471205583081, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1299084741473585, + "kl": 0.027984619140625, + "learning_rate": 4.2560210461410906e-07, + "loss": 0.0992, + "num_tokens": 1645621205.0, + "reward": 2.384486675262451, + "reward_std": 0.5200793147087097, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.18502935767173767, + "step": 2959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 933.7388916015625, + "completions/mean_terminated_length": 713.2700805664062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.630760215225614, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.16559433291797698, + "kl": 0.0306396484375, + "learning_rate": 4.252804185083837e-07, + "loss": 0.0862, + "num_tokens": 1646121152.0, + "reward": 2.415736675262451, + "reward_std": 0.45371320843696594, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15371057391166687, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1016.0647583007812, + "completions/mean_terminated_length": 784.8660888671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6309733098929199, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13604954067442032, + "kl": 0.028472900390625, + "learning_rate": 4.2495880142540007e-07, + "loss": 0.1047, + "num_tokens": 1646646125.0, + "reward": 2.416294813156128, + "reward_std": 0.5145922899246216, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.17598041892051697, + "step": 2961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1002.013427734375, + "completions/mean_terminated_length": 760.6318969726562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6311864045602259, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12936300938247594, + "kl": 0.02728271484375, + "learning_rate": 4.2463725354314893e-07, + "loss": 0.0676, + "num_tokens": 1647166707.0, + "reward": 2.3973214626312256, + "reward_std": 0.467938095331192, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.16981780529022217, + "step": 2962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 953.1451416015625, + "completions/mean_terminated_length": 753.8179931640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6313994992275318, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13145552105635308, + "kl": 0.0316162109375, + "learning_rate": 4.2431577503958217e-07, + "loss": 0.0881, + "num_tokens": 1647665908.0, + "reward": 2.455357313156128, + "reward_std": 0.4772571623325348, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.18066051602363586, + "step": 2963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1024.34375, + "completions/mean_terminated_length": 825.072021484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6316125938948378, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13959372094947137, + "kl": 0.0267333984375, + "learning_rate": 4.239943660926136e-07, + "loss": 0.1096, + "num_tokens": 1648192990.0, + "reward": 2.4603796005249023, + "reward_std": 0.43478232622146606, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11395422369241714, + "step": 2964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 992.99560546875, + "completions/mean_terminated_length": 705.26708984375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.6318256885621437, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1325381857666098, + "kl": 0.025482177734375, + "learning_rate": 4.2367302688011874e-07, + "loss": 0.0888, + "num_tokens": 1648710812.0, + "reward": 2.361049175262451, + "reward_std": 0.3898082673549652, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 2965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 968.1897583007812, + "completions/mean_terminated_length": 729.866455078125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6320387832294497, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14864640967625145, + "kl": 0.029632568359375, + "learning_rate": 4.233517575799338e-07, + "loss": 0.1231, + "num_tokens": 1649216353.0, + "reward": 2.373326063156128, + "reward_std": 0.4889039397239685, + "rewards/accuracy_reward/mean": 0.5115740895271301, + "rewards/accuracy_reward/std": 0.5004456043243408, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.16288256645202637, + "step": 2966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1023.6629638671875, + "completions/mean_terminated_length": 797.5830688476562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6322518778967556, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13637078549174073, + "kl": 0.029632568359375, + "learning_rate": 4.230305583698569e-07, + "loss": 0.0914, + "num_tokens": 1649740602.0, + "reward": 2.4056921005249023, + "reward_std": 0.45324209332466125, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16934573650360107, + "step": 2967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1039.196533203125, + "completions/mean_terminated_length": 792.6000366210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6324649725640615, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12530244172115756, + "kl": 0.026702880859375, + "learning_rate": 4.227094294276473e-07, + "loss": 0.0763, + "num_tokens": 1650284770.0, + "reward": 2.4637277126312256, + "reward_std": 0.41138678789138794, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14895901083946228, + "step": 2968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 939.5156860351562, + "completions/mean_terminated_length": 787.5913696289062, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.6326780672313675, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1416997760616862, + "kl": 0.02728271484375, + "learning_rate": 4.223883709310252e-07, + "loss": 0.0573, + "num_tokens": 1650774201.0, + "reward": 2.3950893878936768, + "reward_std": 0.3920544981956482, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09730304032564163, + "step": 2969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1149.805908203125, + "completions/mean_terminated_length": 809.8738403320312, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.6328911618986734, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11939228816286149, + "kl": 0.024169921875, + "learning_rate": 4.22067383057672e-07, + "loss": 0.0611, + "num_tokens": 1651354146.0, + "reward": 2.3208706378936768, + "reward_std": 0.5122708678245544, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.17529362440109253, + "step": 2970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 997.497802734375, + "completions/mean_terminated_length": 806.2454223632812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6331042565659795, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.9931877282475534, + "kl": 0.03173828125, + "learning_rate": 4.217464659852299e-07, + "loss": 0.0791, + "num_tokens": 1651875329.0, + "reward": 2.443080425262451, + "reward_std": 0.4361160695552826, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12758491933345795, + "step": 2971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1067.91748046875, + "completions/mean_terminated_length": 821.5278930664062, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.6333173512332854, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11457908333525443, + "kl": 0.023529052734375, + "learning_rate": 4.21425619891302e-07, + "loss": 0.0364, + "num_tokens": 1652432812.0, + "reward": 2.3683037757873535, + "reward_std": 0.4116702377796173, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13629460334777832, + "step": 2972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1016.6451416015625, + "completions/mean_terminated_length": 775.1432495117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6335304459005914, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14003342049147827, + "kl": 0.029815673828125, + "learning_rate": 4.2110484495345165e-07, + "loss": 0.0898, + "num_tokens": 1652966173.0, + "reward": 2.3325893878936768, + "reward_std": 0.48819270730018616, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067808747291565, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.16542361676692963, + "step": 2973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1138.493408203125, + "completions/mean_terminated_length": 835.3244018554688, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.6337435405678973, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12875176609317993, + "kl": 0.02496337890625, + "learning_rate": 4.207841413492038e-07, + "loss": 0.0867, + "num_tokens": 1653545210.0, + "reward": 2.3822546005249023, + "reward_std": 0.4378181993961334, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13494952023029327, + "step": 2974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 973.8906860351562, + "completions/mean_terminated_length": 740.3886108398438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6339566352352033, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15653811729111977, + "kl": 0.029510498046875, + "learning_rate": 4.204635092560429e-07, + "loss": 0.1461, + "num_tokens": 1654047433.0, + "reward": 2.42578125, + "reward_std": 0.4407943785190582, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16000598669052124, + "step": 2975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1114.2076416015625, + "completions/mean_terminated_length": 817.5911865234375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.6341697299025092, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11354047711358363, + "kl": 0.0225830078125, + "learning_rate": 4.2014294885141476e-07, + "loss": 0.0123, + "num_tokens": 1654620742.0, + "reward": 2.299107313156128, + "reward_std": 0.39304473996162415, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14285963773727417, + "step": 2976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1070.3817138671875, + "completions/mean_terminated_length": 886.2678833007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6343828245698151, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12367071681383417, + "kl": 0.024566650390625, + "learning_rate": 4.198224603127245e-07, + "loss": 0.084, + "num_tokens": 1655182785.0, + "reward": 2.228236675262451, + "reward_std": 0.4465470612049103, + "rewards/accuracy_reward/mean": 0.3325892984867096, + "rewards/accuracy_reward/std": 0.47166749835014343, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14732414484024048, + "step": 2977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1063.328125, + "completions/mean_terminated_length": 805.371826171875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6345959192371211, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1226016190428056, + "kl": 0.025634765625, + "learning_rate": 4.195020438173381e-07, + "loss": 0.0527, + "num_tokens": 1655723540.0, + "reward": 2.3130581378936768, + "reward_std": 0.40664881467819214, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14869897067546844, + "step": 2978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 997.2678833007812, + "completions/mean_terminated_length": 792.725341796875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.634809013904427, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13436095208757137, + "kl": 0.027313232421875, + "learning_rate": 4.19181699542582e-07, + "loss": 0.0892, + "num_tokens": 1656245676.0, + "reward": 2.4425225257873535, + "reward_std": 0.44571688771247864, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15371058881282806, + "step": 2979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 957.5000610351562, + "completions/mean_terminated_length": 779.0545043945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.635022108571733, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1292508953312534, + "kl": 0.027984619140625, + "learning_rate": 4.188614276657416e-07, + "loss": 0.0614, + "num_tokens": 1656742396.0, + "reward": 2.419642925262451, + "reward_std": 0.4142501950263977, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10383255779743195, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 864.138427734375, + "completions/mean_terminated_length": 718.7518920898438, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.6352352032390389, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13721727498447991, + "kl": 0.03369140625, + "learning_rate": 4.185412283640634e-07, + "loss": 0.0899, + "num_tokens": 1657203802.0, + "reward": 2.560267925262451, + "reward_std": 0.43181318044662476, + "rewards/accuracy_reward/mean": 0.6741071343421936, + "rewards/accuracy_reward/std": 0.4692314565181732, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.13583585619926453, + "step": 2981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1132.3482666015625, + "completions/mean_terminated_length": 852.0466918945312, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.6354482979063449, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11258348807334408, + "kl": 0.023529052734375, + "learning_rate": 4.182211018147528e-07, + "loss": 0.0533, + "num_tokens": 1657782230.0, + "reward": 2.251674175262451, + "reward_std": 0.45014962553977966, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13978439569473267, + "step": 2982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1030.3326416015625, + "completions/mean_terminated_length": 841.8756103515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6356613925736508, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10790544293355565, + "kl": 0.025634765625, + "learning_rate": 4.1790104819497575e-07, + "loss": 0.0537, + "num_tokens": 1658317547.0, + "reward": 2.3543527126312256, + "reward_std": 0.35733041167259216, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11938986927270889, + "step": 2983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1005.2500610351562, + "completions/mean_terminated_length": 812.1481323242188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6358744872409567, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12395280872237067, + "kl": 0.026214599609375, + "learning_rate": 4.175810676818571e-07, + "loss": 0.0883, + "num_tokens": 1658834731.0, + "reward": 2.412388563156128, + "reward_std": 0.3492583632469177, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12201692163944244, + "step": 2984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 915.2723388671875, + "completions/mean_terminated_length": 729.9168701171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6360875819082628, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1347772509847454, + "kl": 0.03082275390625, + "learning_rate": 4.172611604524816e-07, + "loss": 0.078, + "num_tokens": 1659314821.0, + "reward": 2.55078125, + "reward_std": 0.3718591034412384, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.11614610999822617, + "step": 2985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 973.01123046875, + "completions/mean_terminated_length": 773.9391479492188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6363006765755687, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14009595911792153, + "kl": 0.026580810546875, + "learning_rate": 4.1694132668389357e-07, + "loss": 0.0983, + "num_tokens": 1659824234.0, + "reward": 2.4698662757873535, + "reward_std": 0.42816421389579773, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.14693914353847504, + "step": 2986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 878.6785888671875, + "completions/mean_terminated_length": 728.4634399414062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6365137712428747, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12888184496619312, + "kl": 0.031707763671875, + "learning_rate": 4.166215665530964e-07, + "loss": 0.0545, + "num_tokens": 1660287930.0, + "reward": 2.4308037757873535, + "reward_std": 0.40837255120277405, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14955389499664307, + "step": 2987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1041.5982666015625, + "completions/mean_terminated_length": 805.9393920898438, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.6367268659101806, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13356754723905398, + "kl": 0.02508544921875, + "learning_rate": 4.1630188023705306e-07, + "loss": 0.1032, + "num_tokens": 1660820950.0, + "reward": 2.455357313156128, + "reward_std": 0.5176732540130615, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.18192753195762634, + "step": 2988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1006.4375610351562, + "completions/mean_terminated_length": 800.3529663085938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6369399605774866, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12813688435146575, + "kl": 0.02655029296875, + "learning_rate": 4.159822679126852e-07, + "loss": 0.081, + "num_tokens": 1661335754.0, + "reward": 2.4135046005249023, + "reward_std": 0.46410855650901794, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13598167896270752, + "step": 2989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1035.515625, + "completions/mean_terminated_length": 801.8654174804688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6371530552447925, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12312479346631165, + "kl": 0.027130126953125, + "learning_rate": 4.1566272975687386e-07, + "loss": 0.0925, + "num_tokens": 1661874641.0, + "reward": 2.37109375, + "reward_std": 0.43238145112991333, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1310732364654541, + "step": 2990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 979.9241333007812, + "completions/mean_terminated_length": 775.39892578125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6373661499120985, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12362543819219723, + "kl": 0.02801513671875, + "learning_rate": 4.153432659464591e-07, + "loss": 0.0987, + "num_tokens": 1662379135.0, + "reward": 2.411830425262451, + "reward_std": 0.37004172801971436, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15116052329540253, + "step": 2991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 735.7098388671875, + "completions/mean_terminated_length": 638.1535034179688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.6375792445794044, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13963877361113958, + "kl": 0.036102294921875, + "learning_rate": 4.1502387665823915e-07, + "loss": 0.0979, + "num_tokens": 1662777293.0, + "reward": 2.5652902126312256, + "reward_std": 0.36679255962371826, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.0952264666557312, + "step": 2992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 877.5178833007812, + "completions/mean_terminated_length": 699.9896850585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6377923392467103, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13946368758263236, + "kl": 0.030181884765625, + "learning_rate": 4.147045620689723e-07, + "loss": 0.0771, + "num_tokens": 1663235397.0, + "reward": 2.549107313156128, + "reward_std": 0.45275506377220154, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.1378791630268097, + "step": 2993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1050.169677734375, + "completions/mean_terminated_length": 759.73486328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6380054339140163, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13047985088118638, + "kl": 0.02593994140625, + "learning_rate": 4.1438532235537417e-07, + "loss": 0.0699, + "num_tokens": 1663780209.0, + "reward": 2.3911831378936768, + "reward_std": 0.4369259476661682, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12224180996417999, + "step": 2994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 969.6629638671875, + "completions/mean_terminated_length": 766.5808715820312, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6382185285813222, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12311184859582003, + "kl": 0.0267333984375, + "learning_rate": 4.1406615769411977e-07, + "loss": 0.0311, + "num_tokens": 1664282010.0, + "reward": 2.412388563156128, + "reward_std": 0.4343810975551605, + "rewards/accuracy_reward/mean": 0.5046296119689941, + "rewards/accuracy_reward/std": 0.5005582571029663, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.1137678399682045, + "step": 2995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 779.0580444335938, + "completions/mean_terminated_length": 647.7881469726562, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6384316232486282, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.16943548910667572, + "kl": 0.03753662109375, + "learning_rate": 4.1374706826184225e-07, + "loss": 0.0858, + "num_tokens": 1664702772.0, + "reward": 2.58203125, + "reward_std": 0.4265348017215729, + "rewards/accuracy_reward/mean": 0.6763392686843872, + "rewards/accuracy_reward/std": 0.46839532256126404, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12493880838155746, + "step": 2996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1107.15625, + "completions/mean_terminated_length": 826.2667236328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6386447179159341, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13644054718963786, + "kl": 0.024658203125, + "learning_rate": 4.1342805423513317e-07, + "loss": 0.108, + "num_tokens": 1665273226.0, + "reward": 2.2974331378936768, + "reward_std": 0.46128368377685547, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15764793753623962, + "step": 2997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 898.9576416015625, + "completions/mean_terminated_length": 714.3963623046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6388578125832401, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13336070491661936, + "kl": 0.03216552734375, + "learning_rate": 4.131091157905423e-07, + "loss": 0.0359, + "num_tokens": 1665741879.0, + "reward": 2.51171875, + "reward_std": 0.4706316888332367, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.14720547199249268, + "step": 2998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 871.2098388671875, + "completions/mean_terminated_length": 696.2000122070312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.639070907250546, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14377150069264016, + "kl": 0.030792236328125, + "learning_rate": 4.127902531045778e-07, + "loss": 0.0572, + "num_tokens": 1666199253.0, + "reward": 2.482142925262451, + "reward_std": 0.37498924136161804, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14714714884757996, + "step": 2999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1201.6317138671875, + "completions/mean_terminated_length": 909.3423461914062, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.639284001917852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11887372512125888, + "kl": 0.021759033203125, + "learning_rate": 4.1247146635370567e-07, + "loss": 0.0931, + "num_tokens": 1666816496.0, + "reward": 2.283482313156128, + "reward_std": 0.519429087638855, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.17261767387390137, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 937.1897583007812, + "completions/mean_terminated_length": 752.0546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.639497096585158, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13358804861762266, + "kl": 0.02587890625, + "learning_rate": 4.1215275571435014e-07, + "loss": 0.07, + "num_tokens": 1667302181.0, + "reward": 2.3549108505249023, + "reward_std": 0.3632870614528656, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10013572871685028, + "step": 3001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1167.5848388671875, + "completions/mean_terminated_length": 863.5375366210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6397101912524639, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1081429511955512, + "kl": 0.0240478515625, + "learning_rate": 4.1183412136289287e-07, + "loss": 0.0574, + "num_tokens": 1667901947.0, + "reward": 2.294642925262451, + "reward_std": 0.47484898567199707, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.175273135304451, + "step": 3002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 919.2678833007812, + "completions/mean_terminated_length": 731.1458740234375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6399232859197699, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13105404150471958, + "kl": 0.0269775390625, + "learning_rate": 4.115155634756738e-07, + "loss": 0.0256, + "num_tokens": 1668383539.0, + "reward": 2.3984375, + "reward_std": 0.35650792717933655, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.1075822189450264, + "step": 3003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1048.0648193359375, + "completions/mean_terminated_length": 793.1792602539062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6401363805870758, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11780557426247855, + "kl": 0.0262451171875, + "learning_rate": 4.111970822289902e-07, + "loss": 0.0367, + "num_tokens": 1668930976.0, + "reward": 2.3755581378936768, + "reward_std": 0.41952234506607056, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.146857351064682, + "step": 3004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1944.0, + "completions/mean_length": 968.0670166015625, + "completions/mean_terminated_length": 747.4354858398438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6403494752543818, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11940137028206221, + "kl": 0.028045654296875, + "learning_rate": 4.1087867779909713e-07, + "loss": 0.0585, + "num_tokens": 1669441358.0, + "reward": 2.4107143878936768, + "reward_std": 0.4208201766014099, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13526484370231628, + "step": 3005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 942.466552734375, + "completions/mean_terminated_length": 764.8937377929688, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.6405625699216877, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.26227139932313126, + "kl": 0.02886962890625, + "learning_rate": 4.1056035036220716e-07, + "loss": 0.0512, + "num_tokens": 1669941551.0, + "reward": 2.36328125, + "reward_std": 0.4572998881340027, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12605296075344086, + "step": 3006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 885.185302734375, + "completions/mean_terminated_length": 729.1620483398438, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.6407756645889937, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13052642975409745, + "kl": 0.029083251953125, + "learning_rate": 4.102421000944899e-07, + "loss": 0.0947, + "num_tokens": 1670399810.0, + "reward": 2.4916296005249023, + "reward_std": 0.44786086678504944, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1339094191789627, + "step": 3007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 963.716552734375, + "completions/mean_terminated_length": 731.5799560546875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.6409887592562996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13308879468570403, + "kl": 0.029327392578125, + "learning_rate": 4.099239271720729e-07, + "loss": 0.0854, + "num_tokens": 1670898771.0, + "reward": 2.3822546005249023, + "reward_std": 0.4310140907764435, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14249980449676514, + "step": 3008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1009.6272583007812, + "completions/mean_terminated_length": 797.486572265625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.6412018539236055, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13130962357303472, + "kl": 0.02606201171875, + "learning_rate": 4.0960583177104e-07, + "loss": 0.0816, + "num_tokens": 1671425388.0, + "reward": 2.4637277126312256, + "reward_std": 0.5023805499076843, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.13192766904830933, + "step": 3009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 916.5938110351562, + "completions/mean_terminated_length": 761.5278930664062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6414149485909115, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14477767232149627, + "kl": 0.030242919921875, + "learning_rate": 4.092878140674333e-07, + "loss": 0.0998, + "num_tokens": 1671911622.0, + "reward": 2.400111675262451, + "reward_std": 0.40901997685432434, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10593781620264053, + "step": 3010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 822.9420166015625, + "completions/mean_terminated_length": 644.3529663085938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6416280432582174, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14745979197921935, + "kl": 0.033050537109375, + "learning_rate": 4.089698742372506e-07, + "loss": 0.0973, + "num_tokens": 1672344988.0, + "reward": 2.3934152126312256, + "reward_std": 0.48596134781837463, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1810806840658188, + "step": 3011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 929.8527221679688, + "completions/mean_terminated_length": 786.2115478515625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.6418411379255234, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14197540583233784, + "kl": 0.03289794921875, + "learning_rate": 4.086520124564479e-07, + "loss": 0.0734, + "num_tokens": 1672824474.0, + "reward": 2.462611675262451, + "reward_std": 0.41160017251968384, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.1046096533536911, + "step": 3012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 989.6942138671875, + "completions/mean_terminated_length": 734.6453857421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6420542325928293, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13739094661220477, + "kl": 0.027679443359375, + "learning_rate": 4.0833422890093684e-07, + "loss": 0.0814, + "num_tokens": 1673345745.0, + "reward": 2.474330425262451, + "reward_std": 0.3747579753398895, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12695714831352234, + "step": 3013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 989.7879638671875, + "completions/mean_terminated_length": 793.82275390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6422673272601354, + "frac_reward_zero_std": 0.3214285969734192, + "grad_norm": 0.12136133372120773, + "kl": 0.025543212890625, + "learning_rate": 4.0801652374658644e-07, + "loss": 0.0366, + "num_tokens": 1673864866.0, + "reward": 2.34375, + "reward_std": 0.2922419011592865, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.19128035008907318, + "rewards/tag_count_reward/mean": 0.9888392686843872, + "rewards/tag_count_reward/std": 0.07941616326570511, + "step": 3014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1012.5938110351562, + "completions/mean_terminated_length": 794.3189086914062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6424804219274413, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11656234838462415, + "kl": 0.02606201171875, + "learning_rate": 4.0769889716922247e-07, + "loss": 0.0654, + "num_tokens": 1674389180.0, + "reward": 2.4056921005249023, + "reward_std": 0.3991907238960266, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10962119698524475, + "step": 3015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1086.24560546875, + "completions/mean_terminated_length": 861.0413208007812, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6426935165947473, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15186825284309066, + "kl": 0.028350830078125, + "learning_rate": 4.0738134934462643e-07, + "loss": 0.073, + "num_tokens": 1674951178.0, + "reward": 2.4324777126312256, + "reward_std": 0.41315987706184387, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12245608866214752, + "step": 3016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1045.6785888671875, + "completions/mean_terminated_length": 797.1921997070312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6429066112620532, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12608514431960724, + "kl": 0.025177001953125, + "learning_rate": 4.070638804485371e-07, + "loss": 0.0635, + "num_tokens": 1675486458.0, + "reward": 2.2879464626312256, + "reward_std": 0.39923498034477234, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12891364097595215, + "step": 3017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1034.946533203125, + "completions/mean_terminated_length": 787.3111572265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6431197059293591, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12085812751618726, + "kl": 0.02960205078125, + "learning_rate": 4.0674649065664925e-07, + "loss": 0.0499, + "num_tokens": 1676015522.0, + "reward": 2.4263393878936768, + "reward_std": 0.4028944671154022, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.1136813834309578, + "step": 3018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 925.1094360351562, + "completions/mean_terminated_length": 751.4664916992188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6433328005966651, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14452928978424123, + "kl": 0.0291748046875, + "learning_rate": 4.064291801446136e-07, + "loss": 0.0825, + "num_tokens": 1676499539.0, + "reward": 2.4921875, + "reward_std": 0.43308025598526, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15116052329540253, + "step": 3019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 906.6920166015625, + "completions/mean_terminated_length": 779.2506103515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.643545895263971, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1328273240657277, + "kl": 0.03228759765625, + "learning_rate": 4.0611194908803727e-07, + "loss": 0.0879, + "num_tokens": 1676974329.0, + "reward": 2.5072546005249023, + "reward_std": 0.40214458107948303, + "rewards/accuracy_reward/mean": 0.6064814925193787, + "rewards/accuracy_reward/std": 0.4890965521335602, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1327671855688095, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 945.3683471679688, + "completions/mean_terminated_length": 727.2005615234375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.643758989931277, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12488816663588193, + "kl": 0.0299072265625, + "learning_rate": 4.057947976624835e-07, + "loss": 0.0501, + "num_tokens": 1677470494.0, + "reward": 2.3565850257873535, + "reward_std": 0.40017497539520264, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14801737666130066, + "step": 3021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 934.075927734375, + "completions/mean_terminated_length": 765.1259765625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.6439720845985829, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11737828123114541, + "kl": 0.027679443359375, + "learning_rate": 4.0547772604347117e-07, + "loss": 0.0369, + "num_tokens": 1677960896.0, + "reward": 2.467076063156128, + "reward_std": 0.3626514673233032, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11801211535930634, + "step": 3022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 878.63623046875, + "completions/mean_terminated_length": 731.7311401367188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.6441851792658889, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.134153765149431, + "kl": 0.029449462890625, + "learning_rate": 4.0516073440647525e-07, + "loss": 0.0699, + "num_tokens": 1678431933.0, + "reward": 2.4776787757873535, + "reward_std": 0.4417393207550049, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12891364097595215, + "step": 3023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1037.4263916015625, + "completions/mean_terminated_length": 817.7364501953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6443982739331948, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13568530261049974, + "kl": 0.027923583984375, + "learning_rate": 4.0484382292692643e-07, + "loss": 0.1408, + "num_tokens": 1678969788.0, + "reward": 2.4190850257873535, + "reward_std": 0.4392755627632141, + "rewards/accuracy_reward/mean": 0.5370370149612427, + "rewards/accuracy_reward/std": 0.49920445680618286, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14372976124286652, + "step": 3024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 999.7232666015625, + "completions/mean_terminated_length": 802.3023681640625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.6446113686005007, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12795894670591043, + "kl": 0.028717041015625, + "learning_rate": 4.0452699178021076e-07, + "loss": 0.0696, + "num_tokens": 1679490176.0, + "reward": 2.4425225257873535, + "reward_std": 0.48326146602630615, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13180458545684814, + "step": 3025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1081.43310546875, + "completions/mean_terminated_length": 871.309814453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6448244632678067, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1260343593187523, + "kl": 0.02630615234375, + "learning_rate": 4.0421024114167014e-07, + "loss": 0.0902, + "num_tokens": 1680040786.0, + "reward": 2.32421875, + "reward_std": 0.4389100670814514, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15279825031757355, + "step": 3026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 934.4285888671875, + "completions/mean_terminated_length": 748.8333740234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6450375579351126, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1536566109088807, + "kl": 0.032318115234375, + "learning_rate": 4.038935711866019e-07, + "loss": 0.099, + "num_tokens": 1680532962.0, + "reward": 2.4324777126312256, + "reward_std": 0.4652644991874695, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12540756165981293, + "step": 3027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 945.0357666015625, + "completions/mean_terminated_length": 747.6632080078125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6452506526024187, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13023979641259983, + "kl": 0.0281982421875, + "learning_rate": 4.035769820902584e-07, + "loss": 0.0833, + "num_tokens": 1681034914.0, + "reward": 2.3560268878936768, + "reward_std": 0.4603950083255768, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1781519651412964, + "step": 3028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 925.0647583007812, + "completions/mean_terminated_length": 720.6253662109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6454637472697246, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12784006870300718, + "kl": 0.03033447265625, + "learning_rate": 4.032604740278478e-07, + "loss": 0.0729, + "num_tokens": 1681514447.0, + "reward": 2.5106027126312256, + "reward_std": 0.42938828468322754, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12201692163944244, + "step": 3029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1056.029052734375, + "completions/mean_terminated_length": 820.367431640625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6456768419370306, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.48292507424810344, + "kl": 0.041961669921875, + "learning_rate": 4.0294404717453267e-07, + "loss": 0.0456, + "num_tokens": 1682052940.0, + "reward": 2.338169813156128, + "reward_std": 0.4858367443084717, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15247619152069092, + "step": 3030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 984.3750610351562, + "completions/mean_terminated_length": 773.9251708984375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.6458899366043365, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12674738979150268, + "kl": 0.028472900390625, + "learning_rate": 4.0262770170543124e-07, + "loss": 0.0632, + "num_tokens": 1682563700.0, + "reward": 2.459263563156128, + "reward_std": 0.4306187033653259, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13878051936626434, + "step": 3031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 929.185302734375, + "completions/mean_terminated_length": 735.8822021484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6461030312716425, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1448916631642093, + "kl": 0.03021240234375, + "learning_rate": 4.023114377956166e-07, + "loss": 0.1153, + "num_tokens": 1683042695.0, + "reward": 2.5011162757873535, + "reward_std": 0.38418418169021606, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12177752703428268, + "step": 3032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1046.107177734375, + "completions/mean_terminated_length": 790.7227172851562, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6463161259389484, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1438695046424049, + "kl": 0.0260009765625, + "learning_rate": 4.019952556201165e-07, + "loss": 0.1086, + "num_tokens": 1683595239.0, + "reward": 2.3035714626312256, + "reward_std": 0.5223547220230103, + "rewards/accuracy_reward/mean": 0.46759259700775146, + "rewards/accuracy_reward/std": 0.49952712655067444, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.17041954398155212, + "step": 3033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1098.0692138671875, + "completions/mean_terminated_length": 869.1384887695312, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.6465292206062543, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11443706410760887, + "kl": 0.024505615234375, + "learning_rate": 4.016791553539137e-07, + "loss": 0.0755, + "num_tokens": 1684149590.0, + "reward": 2.4090402126312256, + "reward_std": 0.45912063121795654, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16318118572235107, + "step": 3034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 979.5513916015625, + "completions/mean_terminated_length": 764.7158203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6467423152735603, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14979330264254792, + "kl": 0.032073974609375, + "learning_rate": 4.0136313717194524e-07, + "loss": 0.0806, + "num_tokens": 1684663677.0, + "reward": 2.4034600257873535, + "reward_std": 0.4430224299430847, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 3035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1071.53125, + "completions/mean_terminated_length": 878.3262329101562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6469554099408662, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13476884764052116, + "kl": 0.02734375, + "learning_rate": 4.010472012491034e-07, + "loss": 0.1281, + "num_tokens": 1685216379.0, + "reward": 2.2896206378936768, + "reward_std": 0.5698955655097961, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.8816964030265808, + "rewards/format_reward/std": 0.32332828640937805, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.1743151992559433, + "step": 3036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 1078.665283203125, + "completions/mean_terminated_length": 796.5244750976562, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.6471685046081722, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10901786066654653, + "kl": 0.02227783203125, + "learning_rate": 4.0073134776023434e-07, + "loss": 0.0186, + "num_tokens": 1685770293.0, + "reward": 2.376674175262451, + "reward_std": 0.34220561385154724, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12130890041589737, + "step": 3037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 899.74560546875, + "completions/mean_terminated_length": 718.7545166015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6473815992754781, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15556613632311114, + "kl": 0.030029296875, + "learning_rate": 4.004155768801385e-07, + "loss": 0.1193, + "num_tokens": 1686239715.0, + "reward": 2.4419643878936768, + "reward_std": 0.4940684139728546, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.170989990234375, + "step": 3038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 884.3438110351562, + "completions/mean_terminated_length": 711.2872314453125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6475946939427841, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1563064627210784, + "kl": 0.031494140625, + "learning_rate": 4.0009988878357123e-07, + "loss": 0.0489, + "num_tokens": 1686697773.0, + "reward": 2.489955425262451, + "reward_std": 0.40380969643592834, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.10225148499011993, + "step": 3039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 975.122802734375, + "completions/mean_terminated_length": 727.5357055664062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.64780778861009, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14172186747070342, + "kl": 0.02911376953125, + "learning_rate": 3.9978428364524166e-07, + "loss": 0.1, + "num_tokens": 1687204932.0, + "reward": 2.3331475257873535, + "reward_std": 0.4661312699317932, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15279822051525116, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 942.0067138671875, + "completions/mean_terminated_length": 730.220703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6480208832773959, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13190541895586252, + "kl": 0.027496337890625, + "learning_rate": 3.9946876163981303e-07, + "loss": 0.0736, + "num_tokens": 1687696311.0, + "reward": 2.45703125, + "reward_std": 0.46261462569236755, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13359208405017853, + "step": 3041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1049.2410888671875, + "completions/mean_terminated_length": 798.1563720703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.648233977944702, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.32176250662639433, + "kl": 0.030181884765625, + "learning_rate": 3.9915332294190287e-07, + "loss": 0.0758, + "num_tokens": 1688242467.0, + "reward": 2.263951063156128, + "reward_std": 0.49381107091903687, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.17896561324596405, + "step": 3042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 963.7053833007812, + "completions/mean_terminated_length": 766.3008422851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6484470726120078, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12707168109074957, + "kl": 0.028472900390625, + "learning_rate": 3.988379677260818e-07, + "loss": 0.0645, + "num_tokens": 1688746399.0, + "reward": 2.385044813156128, + "reward_std": 0.5021392703056335, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.16051718592643738, + "step": 3043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 874.1160888671875, + "completions/mean_terminated_length": 752.6798095703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6486601672793139, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13668350349310235, + "kl": 0.03082275390625, + "learning_rate": 3.985226961668754e-07, + "loss": 0.0975, + "num_tokens": 1689207411.0, + "reward": 2.5066964626312256, + "reward_std": 0.43065667152404785, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.1168653815984726, + "step": 3044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 896.24560546875, + "completions/mean_terminated_length": 745.0050659179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6488732619466198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1516798701860554, + "kl": 0.03009033203125, + "learning_rate": 3.982075084387617e-07, + "loss": 0.0945, + "num_tokens": 1689676881.0, + "reward": 2.5072546005249023, + "reward_std": 0.48922237753868103, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14732414484024048, + "step": 3045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1146.5692138671875, + "completions/mean_terminated_length": 874.0435791015625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6490863566139258, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12001113937709317, + "kl": 0.02227783203125, + "learning_rate": 3.978924047161738e-07, + "loss": 0.0339, + "num_tokens": 1690272240.0, + "reward": 2.2527902126312256, + "reward_std": 0.42920854687690735, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11682131141424179, + "step": 3046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1036.9398193359375, + "completions/mean_terminated_length": 823.7973022460938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6492994512812317, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12153460054745978, + "kl": 0.02630615234375, + "learning_rate": 3.975773851734967e-07, + "loss": 0.0736, + "num_tokens": 1690808165.0, + "reward": 2.23828125, + "reward_std": 0.4820942282676697, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067809045314789, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15488377213478088, + "step": 3047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 998.97998046875, + "completions/mean_terminated_length": 760.4356079101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6495125459485377, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1341528321485664, + "kl": 0.0277099609375, + "learning_rate": 3.972624499850703e-07, + "loss": 0.093, + "num_tokens": 1691327916.0, + "reward": 2.4129464626312256, + "reward_std": 0.49112632870674133, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14187753200531006, + "step": 3048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 968.41748046875, + "completions/mean_terminated_length": 733.7255859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6497256406158436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15143826049676645, + "kl": 0.029449462890625, + "learning_rate": 3.9694759932518663e-07, + "loss": 0.1041, + "num_tokens": 1691828919.0, + "reward": 2.3856027126312256, + "reward_std": 0.5096012353897095, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.17171035706996918, + "step": 3049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 956.0848388671875, + "completions/mean_terminated_length": 774.0989990234375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6499387352831495, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14919818462344184, + "kl": 0.0283203125, + "learning_rate": 3.966328333680914e-07, + "loss": 0.0992, + "num_tokens": 1692314589.0, + "reward": 2.3900671005249023, + "reward_std": 0.5258574485778809, + "rewards/accuracy_reward/mean": 0.5601851940155029, + "rewards/accuracy_reward/std": 0.496940016746521, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14461299777030945, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 967.3951416015625, + "completions/mean_terminated_length": 760.470703125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6501518299504555, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13839694249467505, + "kl": 0.02984619140625, + "learning_rate": 3.963181522879837e-07, + "loss": 0.0471, + "num_tokens": 1692812158.0, + "reward": 2.439174175262451, + "reward_std": 0.42624610662460327, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.16374634206295013, + "step": 3051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 932.8438110351562, + "completions/mean_terminated_length": 746.984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6503649246177614, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14212434964174786, + "kl": 0.029876708984375, + "learning_rate": 3.960035562590154e-07, + "loss": 0.0689, + "num_tokens": 1693297336.0, + "reward": 2.4458706378936768, + "reward_std": 0.4433249235153198, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14469929039478302, + "step": 3052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1057.8304443359375, + "completions/mean_terminated_length": 819.2022094726562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6505780192850674, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12727239781152336, + "kl": 0.025146484375, + "learning_rate": 3.956890454552914e-07, + "loss": 0.0377, + "num_tokens": 1693845564.0, + "reward": 2.373326063156128, + "reward_std": 0.36856740713119507, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13335825502872467, + "step": 3053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1008.7969360351562, + "completions/mean_terminated_length": 772.4849243164062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6507911139523733, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.5756412279788655, + "kl": 0.07489013671875, + "learning_rate": 3.953746200508693e-07, + "loss": 0.1053, + "num_tokens": 1694376721.0, + "reward": 2.3448662757873535, + "reward_std": 0.4737739562988281, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14835961163043976, + "step": 3054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1042.294677734375, + "completions/mean_terminated_length": 789.4636840820312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6510042086196793, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13506602384235966, + "kl": 0.026031494140625, + "learning_rate": 3.950602802197591e-07, + "loss": 0.0438, + "num_tokens": 1694919525.0, + "reward": 2.3470983505249023, + "reward_std": 0.46464523673057556, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.10627460479736328, + "step": 3055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1043.415283203125, + "completions/mean_terminated_length": 794.36767578125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6512173032869852, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.20849132946117305, + "kl": 0.048126220703125, + "learning_rate": 3.9474602613592454e-07, + "loss": 0.0759, + "num_tokens": 1695464399.0, + "reward": 2.40234375, + "reward_std": 0.4604419469833374, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1581934094429016, + "step": 3056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 941.2678833007812, + "completions/mean_terminated_length": 711.5687255859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6514303979542911, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1366623426241431, + "kl": 0.02801513671875, + "learning_rate": 3.944318579732805e-07, + "loss": 0.0802, + "num_tokens": 1695954887.0, + "reward": 2.326451063156128, + "reward_std": 0.396142840385437, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13598167896270752, + "step": 3057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 821.6116333007812, + "completions/mean_terminated_length": 688.0445556640625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.6516434926215972, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13574468995611583, + "kl": 0.030792236328125, + "learning_rate": 3.941177759056955e-07, + "loss": 0.047, + "num_tokens": 1696395305.0, + "reward": 2.5658483505249023, + "reward_std": 0.37883028388023376, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9854910969734192, + "rewards/tag_count_reward/std": 0.08887429535388947, + "step": 3058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 953.91748046875, + "completions/mean_terminated_length": 744.4122314453125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.651856587288903, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14094800299288607, + "kl": 0.028350830078125, + "learning_rate": 3.938037801069898e-07, + "loss": 0.054, + "num_tokens": 1696900932.0, + "reward": 2.3638393878936768, + "reward_std": 0.34052330255508423, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9866071343421936, + "rewards/tag_count_reward/std": 0.09060624986886978, + "step": 3059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1004.591552734375, + "completions/mean_terminated_length": 804.7898559570312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6520696819562091, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12427462737778618, + "kl": 0.02813720703125, + "learning_rate": 3.9348987075093596e-07, + "loss": 0.0713, + "num_tokens": 1697420317.0, + "reward": 2.454799175262451, + "reward_std": 0.4182373285293579, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14372976124286652, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 930.6964721679688, + "completions/mean_terminated_length": 783.9797973632812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.652282776623515, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14158414708276093, + "kl": 0.030609130859375, + "learning_rate": 3.93176048011259e-07, + "loss": 0.0928, + "num_tokens": 1697902149.0, + "reward": 2.455357313156128, + "reward_std": 0.49798741936683655, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1486160308122635, + "step": 3061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 915.21435546875, + "completions/mean_terminated_length": 753.3877563476562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.652495871290821, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13325849960618702, + "kl": 0.030303955078125, + "learning_rate": 3.928623120616353e-07, + "loss": 0.0765, + "num_tokens": 1698379669.0, + "reward": 2.4302456378936768, + "reward_std": 0.47848281264305115, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13566918671131134, + "step": 3062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1104.8638916015625, + "completions/mean_terminated_length": 823.2898559570312, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6527089659581269, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12039531062356246, + "kl": 0.026123046875, + "learning_rate": 3.9254866307569433e-07, + "loss": 0.0348, + "num_tokens": 1698938328.0, + "reward": 2.353236675262451, + "reward_std": 0.3765774667263031, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15332838892936707, + "step": 3063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 942.83935546875, + "completions/mean_terminated_length": 768.640869140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6529220606254329, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12756613966517744, + "kl": 0.02801513671875, + "learning_rate": 3.922351012270162e-07, + "loss": 0.0787, + "num_tokens": 1699431712.0, + "reward": 2.48828125, + "reward_std": 0.44719505310058594, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.49405214190483093, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12870889902114868, + "step": 3064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1030.1473388671875, + "completions/mean_terminated_length": 791.8071899414062, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6531351552927388, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1261735607909035, + "kl": 0.025970458984375, + "learning_rate": 3.919216266891339e-07, + "loss": 0.0856, + "num_tokens": 1699959842.0, + "reward": 2.4229912757873535, + "reward_std": 0.4391017258167267, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15610110759735107, + "step": 3065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 941.529052734375, + "completions/mean_terminated_length": 773.7095336914062, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6533482499600447, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13295850986629326, + "kl": 0.029754638671875, + "learning_rate": 3.9160823963553127e-07, + "loss": 0.0145, + "num_tokens": 1700452111.0, + "reward": 2.4246652126312256, + "reward_std": 0.41219159960746765, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1240563914179802, + "step": 3066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1031.96875, + "completions/mean_terminated_length": 800.926025390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6535613446273507, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12957773420062366, + "kl": 0.025299072265625, + "learning_rate": 3.91294940239644e-07, + "loss": 0.0852, + "num_tokens": 1700990177.0, + "reward": 2.4793527126312256, + "reward_std": 0.40322282910346985, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11780036240816116, + "step": 3067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1109.654052734375, + "completions/mean_terminated_length": 860.4887084960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6537744392946566, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12357394879429899, + "kl": 0.023834228515625, + "learning_rate": 3.909817286748597e-07, + "loss": 0.0596, + "num_tokens": 1701560230.0, + "reward": 2.4609375, + "reward_std": 0.45786532759666443, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1396559327840805, + "step": 3068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 937.3370971679688, + "completions/mean_terminated_length": 738.5868530273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6539875339619626, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13459269312145158, + "kl": 0.028900146484375, + "learning_rate": 3.906686051145166e-07, + "loss": 0.1433, + "num_tokens": 1702046589.0, + "reward": 2.404576063156128, + "reward_std": 0.4564734399318695, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.15706878900527954, + "step": 3069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 909.7500610351562, + "completions/mean_terminated_length": 688.170654296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6542006286292685, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15120348125988642, + "kl": 0.031402587890625, + "learning_rate": 3.9035556973190484e-07, + "loss": 0.0631, + "num_tokens": 1702523101.0, + "reward": 2.4810268878936768, + "reward_std": 0.365992933511734, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12292034178972244, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1095.4129638671875, + "completions/mean_terminated_length": 881.9917602539062, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6544137232965745, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12144369579231543, + "kl": 0.02325439453125, + "learning_rate": 3.9004262270026543e-07, + "loss": 0.0242, + "num_tokens": 1703088006.0, + "reward": 2.2578125, + "reward_std": 0.4320054054260254, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.16310946643352509, + "step": 3071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1036.529052734375, + "completions/mean_terminated_length": 823.300048828125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6546268179638804, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10870045352907412, + "kl": 0.02581787109375, + "learning_rate": 3.897297641927909e-07, + "loss": 0.088, + "num_tokens": 1703627971.0, + "reward": 2.4419643878936768, + "reward_std": 0.3953354060649872, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12083587795495987, + "step": 3072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 895.8638916015625, + "completions/mean_terminated_length": 776.6773071289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6548399126311863, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.2536314090854561, + "kl": 0.039398193359375, + "learning_rate": 3.894169943826242e-07, + "loss": 0.0411, + "num_tokens": 1704106774.0, + "reward": 2.650669813156128, + "reward_std": 0.3904971778392792, + "rewards/accuracy_reward/mean": 0.7366071343421936, + "rewards/accuracy_reward/std": 0.44096609950065613, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11589459329843521, + "step": 3073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 916.1428833007812, + "completions/mean_terminated_length": 724.05224609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6550530072984924, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.134044725451534, + "kl": 0.030120849609375, + "learning_rate": 3.891043134428593e-07, + "loss": 0.0941, + "num_tokens": 1704588534.0, + "reward": 2.3989956378936768, + "reward_std": 0.39790162444114685, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11088934540748596, + "step": 3074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 987.8928833007812, + "completions/mean_terminated_length": 814.4207763671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6552661019657983, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12312706404244447, + "kl": 0.027008056640625, + "learning_rate": 3.8879172154654163e-07, + "loss": 0.0796, + "num_tokens": 1705096070.0, + "reward": 2.506138563156128, + "reward_std": 0.4249289333820343, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1258348822593689, + "step": 3075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1078.71435546875, + "completions/mean_terminated_length": 851.74658203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6554791966331043, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1260331228238506, + "kl": 0.0240478515625, + "learning_rate": 3.8847921886666634e-07, + "loss": 0.097, + "num_tokens": 1705651174.0, + "reward": 2.353236675262451, + "reward_std": 0.4320808947086334, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.16628077626228333, + "step": 3076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 901.5469360351562, + "completions/mean_terminated_length": 757.5200805664062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6556922913004102, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12905305697282277, + "kl": 0.029510498046875, + "learning_rate": 3.881668055761803e-07, + "loss": 0.0864, + "num_tokens": 1706125259.0, + "reward": 2.5150671005249023, + "reward_std": 0.395595908164978, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.12055531144142151, + "step": 3077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 950.7723388671875, + "completions/mean_terminated_length": 797.21630859375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.6559053859677162, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1183443618691964, + "kl": 0.02716064453125, + "learning_rate": 3.8785448184798006e-07, + "loss": 0.1062, + "num_tokens": 1706623509.0, + "reward": 2.4693081378936768, + "reward_std": 0.4127633273601532, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316954612732, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.2463276982307434, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.1119314506649971, + "step": 3078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 989.0223388671875, + "completions/mean_terminated_length": 782.8746337890625, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.6561184806350221, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11486868110023093, + "kl": 0.027587890625, + "learning_rate": 3.8754224785491283e-07, + "loss": 0.0524, + "num_tokens": 1707138143.0, + "reward": 2.513392925262451, + "reward_std": 0.3562312424182892, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12733516097068787, + "step": 3079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1059.122802734375, + "completions/mean_terminated_length": 803.5702514648438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6563315753023281, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11817245490244976, + "kl": 0.02557373046875, + "learning_rate": 3.872301037697766e-07, + "loss": 0.1182, + "num_tokens": 1707679286.0, + "reward": 2.3515625, + "reward_std": 0.36329910159111023, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13559210300445557, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 960.7969360351562, + "completions/mean_terminated_length": 779.5963745117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.656544669969634, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11746313667325094, + "kl": 0.029205322265625, + "learning_rate": 3.869180497653186e-07, + "loss": 0.0389, + "num_tokens": 1708177723.0, + "reward": 2.400111675262451, + "reward_std": 0.36465469002723694, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13170984387397766, + "step": 3081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 892.107177734375, + "completions/mean_terminated_length": 763.0372314453125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6567577646369399, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1250789780598846, + "kl": 0.030181884765625, + "learning_rate": 3.866060860142375e-07, + "loss": 0.0682, + "num_tokens": 1708644411.0, + "reward": 2.6411831378936768, + "reward_std": 0.3977411389350891, + "rewards/accuracy_reward/mean": 0.7075892686843872, + "rewards/accuracy_reward/std": 0.4553784728050232, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9849330186843872, + "rewards/tag_count_reward/std": 0.086386539041996, + "step": 3082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 962.6473388671875, + "completions/mean_terminated_length": 751.3652954101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6569708593042459, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12350389008524336, + "kl": 0.02972412109375, + "learning_rate": 3.862942126891809e-07, + "loss": 0.0491, + "num_tokens": 1709141837.0, + "reward": 2.3353796005249023, + "reward_std": 0.39777690172195435, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1514935940504074, + "step": 3083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1149.763427734375, + "completions/mean_terminated_length": 857.4378662109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6571839539715518, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1270818929851658, + "kl": 0.0235595703125, + "learning_rate": 3.859824299627469e-07, + "loss": 0.1207, + "num_tokens": 1709728435.0, + "reward": 2.2935268878936768, + "reward_std": 0.5091504454612732, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.16599240899085999, + "step": 3084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1001.075927734375, + "completions/mean_terminated_length": 810.4749755859375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6573970486388578, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12347514829414148, + "kl": 0.02777099609375, + "learning_rate": 3.856707380074836e-07, + "loss": 0.0734, + "num_tokens": 1710244245.0, + "reward": 2.4034600257873535, + "reward_std": 0.4112948477268219, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15933358669281006, + "step": 3085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 931.0848388671875, + "completions/mean_terminated_length": 764.9794921875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6576101433061637, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1441042502814212, + "kl": 0.02874755859375, + "learning_rate": 3.853591369958884e-07, + "loss": 0.1047, + "num_tokens": 1710726123.0, + "reward": 2.5150671005249023, + "reward_std": 0.4763098359107971, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13878051936626434, + "step": 3086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1082.274658203125, + "completions/mean_terminated_length": 856.1405029296875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6578232379734698, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13085358210702872, + "kl": 0.0257568359375, + "learning_rate": 3.850476271004087e-07, + "loss": 0.0768, + "num_tokens": 1711287750.0, + "reward": 2.2974331378936768, + "reward_std": 0.47224316000938416, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.15107274055480957, + "step": 3087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 827.107177734375, + "completions/mean_terminated_length": 704.117919921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6580363326407757, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12891073604438358, + "kl": 0.03436279296875, + "learning_rate": 3.8473620849344127e-07, + "loss": 0.0059, + "num_tokens": 1711726934.0, + "reward": 2.588169813156128, + "reward_std": 0.40565165877342224, + "rewards/accuracy_reward/mean": 0.6808035969734192, + "rewards/accuracy_reward/std": 0.4666863977909088, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407156348228455, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11468179523944855, + "step": 3088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 963.2500610351562, + "completions/mean_terminated_length": 755.5319213867188, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.6582494273080817, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11382157880361331, + "kl": 0.02630615234375, + "learning_rate": 3.8442488134733276e-07, + "loss": 0.0075, + "num_tokens": 1712227366.0, + "reward": 2.5122768878936768, + "reward_std": 0.36341267824172974, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9899553656578064, + "rewards/tag_count_reward/std": 0.07219472527503967, + "step": 3089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 919.6138916015625, + "completions/mean_terminated_length": 774.6574096679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6584625219753876, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12375196769218905, + "kl": 0.031982421875, + "learning_rate": 3.8411364583437876e-07, + "loss": 0.0507, + "num_tokens": 1712706649.0, + "reward": 2.470982313156128, + "reward_std": 0.38491514325141907, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1486160308122635, + "step": 3090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 907.1741333007812, + "completions/mean_terminated_length": 723.9326171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6586756166426935, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1417443310356198, + "kl": 0.027923583984375, + "learning_rate": 3.838025021268241e-07, + "loss": 0.0731, + "num_tokens": 1713185959.0, + "reward": 2.4581475257873535, + "reward_std": 0.42494910955429077, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10500273108482361, + "step": 3091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1062.625, + "completions/mean_terminated_length": 858.1131591796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6588887113099995, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1272366908118341, + "kl": 0.025482177734375, + "learning_rate": 3.8349145039686317e-07, + "loss": 0.0446, + "num_tokens": 1713730271.0, + "reward": 2.3565850257873535, + "reward_std": 0.4734190106391907, + "rewards/accuracy_reward/mean": 0.48842594027519226, + "rewards/accuracy_reward/std": 0.500445544719696, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.12992529571056366, + "step": 3092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1078.732177734375, + "completions/mean_terminated_length": 821.35595703125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6591018059773054, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.8727786147800605, + "kl": 0.025543212890625, + "learning_rate": 3.831804908166393e-07, + "loss": 0.0516, + "num_tokens": 1714300007.0, + "reward": 2.3560268878936768, + "reward_std": 0.3809712529182434, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1401200145483017, + "step": 3093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1038.328125, + "completions/mean_terminated_length": 791.5194702148438, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.6593149006446114, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13565806163532326, + "kl": 0.02777099609375, + "learning_rate": 3.8286962355824495e-07, + "loss": 0.0931, + "num_tokens": 1714838922.0, + "reward": 2.4090402126312256, + "reward_std": 0.4496128559112549, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12450840324163437, + "step": 3094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 889.5848388671875, + "completions/mean_terminated_length": 720.7109985351562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6595279953119173, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13736879986117598, + "kl": 0.03167724609375, + "learning_rate": 3.825588487937211e-07, + "loss": 0.0386, + "num_tokens": 1715302832.0, + "reward": 2.482142925262451, + "reward_std": 0.40892651677131653, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13271193206310272, + "step": 3095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1093.77685546875, + "completions/mean_terminated_length": 853.8882446289062, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6597410899792233, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12639586455013752, + "kl": 0.024322509765625, + "learning_rate": 3.82248166695058e-07, + "loss": 0.0959, + "num_tokens": 1715865532.0, + "reward": 2.3939733505249023, + "reward_std": 0.4523405432701111, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1479213982820511, + "step": 3096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 904.7902221679688, + "completions/mean_terminated_length": 734.7743530273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6599541846465292, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12700185227354238, + "kl": 0.032806396484375, + "learning_rate": 3.819375774341944e-07, + "loss": 0.0374, + "num_tokens": 1716342062.0, + "reward": 2.4966518878936768, + "reward_std": 0.38179266452789307, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13911856710910797, + "step": 3097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1004.0402221679688, + "completions/mean_terminated_length": 759.5867919921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6601672793138351, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13718327695824983, + "kl": 0.028594970703125, + "learning_rate": 3.8162708118301736e-07, + "loss": 0.096, + "num_tokens": 1716855728.0, + "reward": 2.4832589626312256, + "reward_std": 0.41527578234672546, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1599874496459961, + "step": 3098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 968.22998046875, + "completions/mean_terminated_length": 804.4601440429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6603803739811411, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1286667466469969, + "kl": 0.027984619140625, + "learning_rate": 3.8131667811336334e-07, + "loss": 0.063, + "num_tokens": 1717351223.0, + "reward": 2.474330425262451, + "reward_std": 0.4458337724208832, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13189572095870972, + "step": 3099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1020.3192138671875, + "completions/mean_terminated_length": 754.73876953125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.660593468648447, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12509926888190004, + "kl": 0.02569580078125, + "learning_rate": 3.8100636839701594e-07, + "loss": 0.098, + "num_tokens": 1717879510.0, + "reward": 2.2974331378936768, + "reward_std": 0.3995503783226013, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1477556824684143, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 895.9397583007812, + "completions/mean_terminated_length": 747.9420166015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.660806563315753, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12471956882233191, + "kl": 0.0284423828125, + "learning_rate": 3.806961522057087e-07, + "loss": 0.0318, + "num_tokens": 1718358507.0, + "reward": 2.4877233505249023, + "reward_std": 0.3503088355064392, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.0852610394358635, + "step": 3101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 990.700927734375, + "completions/mean_terminated_length": 804.7716674804688, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.661019657983059, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13476767129483264, + "kl": 0.0267333984375, + "learning_rate": 3.80386029711122e-07, + "loss": 0.0793, + "num_tokens": 1718873893.0, + "reward": 2.3716518878936768, + "reward_std": 0.487751841545105, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15208269655704498, + "step": 3102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1101.450927734375, + "completions/mean_terminated_length": 846.7138671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.661232752650365, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1135335624883966, + "kl": 0.022247314453125, + "learning_rate": 3.8007600108488503e-07, + "loss": 0.1092, + "num_tokens": 1719446543.0, + "reward": 2.310267925262451, + "reward_std": 0.41257601976394653, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.49405214190483093, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.17468811571598053, + "step": 3103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1042.009033203125, + "completions/mean_terminated_length": 809.857177734375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6614458473176709, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14557508118414644, + "kl": 0.026214599609375, + "learning_rate": 3.797660664985748e-07, + "loss": 0.142, + "num_tokens": 1719984963.0, + "reward": 2.41796875, + "reward_std": 0.5324880480766296, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14963631331920624, + "step": 3104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 931.9710083007812, + "completions/mean_terminated_length": 762.7017822265625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6616589419849769, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13670827050044176, + "kl": 0.030853271484375, + "learning_rate": 3.794562261237164e-07, + "loss": 0.0901, + "num_tokens": 1720472278.0, + "reward": 2.333705425262451, + "reward_std": 0.4587516784667969, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067808747291565, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1406535655260086, + "step": 3105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1215.4241943359375, + "completions/mean_terminated_length": 927.89794921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6618720366522828, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11510770536061836, + "kl": 0.02447509765625, + "learning_rate": 3.7914648013178297e-07, + "loss": 0.0829, + "num_tokens": 1721088180.0, + "reward": 2.2393975257873535, + "reward_std": 0.4575740098953247, + "rewards/accuracy_reward/mean": 0.3638392984867096, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.1611565798521042, + "step": 3106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 852.872802734375, + "completions/mean_terminated_length": 712.7955322265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6620851313195887, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14223276586948388, + "kl": 0.03076171875, + "learning_rate": 3.7883682869419507e-07, + "loss": 0.0674, + "num_tokens": 1721540091.0, + "reward": 2.578125, + "reward_std": 0.42145445942878723, + "rewards/accuracy_reward/mean": 0.6785714030265808, + "rewards/accuracy_reward/std": 0.4675469994544983, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10289046913385391, + "step": 3107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 1118.921875, + "completions/mean_terminated_length": 848.49853515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6622982259868947, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1263835314218856, + "kl": 0.022491455078125, + "learning_rate": 3.7852727198232104e-07, + "loss": 0.0589, + "num_tokens": 1722114984.0, + "reward": 2.326451063156128, + "reward_std": 0.417420357465744, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13903217017650604, + "step": 3108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1041.6004638671875, + "completions/mean_terminated_length": 792.10302734375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6625113206542006, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13141432484372964, + "kl": 0.026580810546875, + "learning_rate": 3.782178101674768e-07, + "loss": 0.099, + "num_tokens": 1722649381.0, + "reward": 2.3253350257873535, + "reward_std": 0.5206719636917114, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.1875956654548645, + "step": 3109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 950.6875610351562, + "completions/mean_terminated_length": 797.1195678710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6627244153215066, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11927302853964485, + "kl": 0.02825927734375, + "learning_rate": 3.7790844342092576e-07, + "loss": 0.0947, + "num_tokens": 1723144329.0, + "reward": 2.5418527126312256, + "reward_std": 0.4637902081012726, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12315750867128372, + "step": 3110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 962.5535888671875, + "completions/mean_terminated_length": 768.3157958984375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6629375099888125, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13194249591660048, + "kl": 0.028717041015625, + "learning_rate": 3.775991719138789e-07, + "loss": 0.097, + "num_tokens": 1723643361.0, + "reward": 2.443080425262451, + "reward_std": 0.47731804847717285, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.16172590851783752, + "step": 3111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1047.58935546875, + "completions/mean_terminated_length": 849.6470947265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6631506046561185, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11170216943161124, + "kl": 0.024505615234375, + "learning_rate": 3.772899958174942e-07, + "loss": 0.0696, + "num_tokens": 1724187289.0, + "reward": 2.40234375, + "reward_std": 0.3776506185531616, + "rewards/accuracy_reward/mean": 0.49074074625968933, + "rewards/accuracy_reward/std": 0.5004938244819641, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1256263703107834, + "step": 3112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 1036.234375, + "completions/mean_terminated_length": 806.1616821289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6633636993234244, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.7692815477780209, + "kl": 0.09539794921875, + "learning_rate": 3.7698091530287703e-07, + "loss": 0.107, + "num_tokens": 1724729058.0, + "reward": 2.40234375, + "reward_std": 0.44508540630340576, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1518804132938385, + "step": 3113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 997.1428833007812, + "completions/mean_terminated_length": 782.4515991210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6635767939907303, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13817653175526767, + "kl": 0.02783203125, + "learning_rate": 3.7667193054107984e-07, + "loss": 0.0778, + "num_tokens": 1725243074.0, + "reward": 2.4369421005249023, + "reward_std": 0.45300424098968506, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13099703192710876, + "step": 3114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 976.513427734375, + "completions/mean_terminated_length": 761.0670776367188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6637898886580363, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13151625241229475, + "kl": 0.029815673828125, + "learning_rate": 3.7636304170310176e-07, + "loss": 0.0814, + "num_tokens": 1725746248.0, + "reward": 2.467076063156128, + "reward_std": 0.46433356404304504, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9425223469734192, + "rewards/tag_count_reward/std": 0.19041606783866882, + "step": 3115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1006.0223388671875, + "completions/mean_terminated_length": 819.5631713867188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6640029833253422, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12057276742612635, + "kl": 0.026092529296875, + "learning_rate": 3.760542489598896e-07, + "loss": 0.0552, + "num_tokens": 1726264674.0, + "reward": 2.482142925262451, + "reward_std": 0.40842026472091675, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12040118128061295, + "step": 3116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1057.890625, + "completions/mean_terminated_length": 802.0196533203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6642160779926483, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13355482734526317, + "kl": 0.025726318359375, + "learning_rate": 3.7574555248233574e-07, + "loss": 0.1035, + "num_tokens": 1726804641.0, + "reward": 2.37109375, + "reward_std": 0.4875785708427429, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15586401522159576, + "step": 3117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 962.05810546875, + "completions/mean_terminated_length": 771.0918579101562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6644291726599542, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12193429456029223, + "kl": 0.03070068359375, + "learning_rate": 3.75436952441281e-07, + "loss": 0.0398, + "num_tokens": 1727303659.0, + "reward": 2.364955425262451, + "reward_std": 0.3756348490715027, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15063102543354034, + "step": 3118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 932.1495971679688, + "completions/mean_terminated_length": 722.0026245117188, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6646422673272602, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12381574597006693, + "kl": 0.028594970703125, + "learning_rate": 3.7512844900751126e-07, + "loss": 0.0559, + "num_tokens": 1727792190.0, + "reward": 2.4715402126312256, + "reward_std": 0.4454174041748047, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13598167896270752, + "step": 3119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 996.52685546875, + "completions/mean_terminated_length": 808.3684692382812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6648553619945661, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11977573656996728, + "kl": 0.02752685546875, + "learning_rate": 3.7482004235175977e-07, + "loss": 0.0826, + "num_tokens": 1728317242.0, + "reward": 2.4151787757873535, + "reward_std": 0.4344754219055176, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14140157401561737, + "step": 3120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 954.044677734375, + "completions/mean_terminated_length": 737.5935668945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6650684566618721, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 1.81741237290172, + "kl": 0.03192138671875, + "learning_rate": 3.745117326447059e-07, + "loss": 0.0658, + "num_tokens": 1728827198.0, + "reward": 2.4536831378936768, + "reward_std": 0.4123454988002777, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12151455134153366, + "step": 3121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1034.575927734375, + "completions/mean_terminated_length": 772.6798095703125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.665281551329178, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1325237996618248, + "kl": 0.028045654296875, + "learning_rate": 3.742035200569755e-07, + "loss": 0.1099, + "num_tokens": 1729359824.0, + "reward": 2.255580425262451, + "reward_std": 0.437264621257782, + "rewards/accuracy_reward/mean": 0.3683035671710968, + "rewards/accuracy_reward/std": 0.4828835725784302, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16713166236877441, + "step": 3122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1044.950927734375, + "completions/mean_terminated_length": 830.2059936523438, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6654946459964839, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.2223081589402521, + "kl": 0.025543212890625, + "learning_rate": 3.738954047591407e-07, + "loss": 0.0685, + "num_tokens": 1729899290.0, + "reward": 2.404576063156128, + "reward_std": 0.3951433598995209, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1395251452922821, + "step": 3123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 965.4910888671875, + "completions/mean_terminated_length": 788.3532104492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6657077406637899, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12772861171746103, + "kl": 0.02581787109375, + "learning_rate": 3.7358738692171965e-07, + "loss": 0.0836, + "num_tokens": 1730401302.0, + "reward": 2.458705425262451, + "reward_std": 0.5159845352172852, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.17990919947624207, + "step": 3124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1069.12060546875, + "completions/mean_terminated_length": 833.2132568359375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.6659208353310958, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12351318564293766, + "kl": 0.025970458984375, + "learning_rate": 3.7327946671517685e-07, + "loss": 0.0618, + "num_tokens": 1730950252.0, + "reward": 2.4017858505249023, + "reward_std": 0.4600284993648529, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1756715625524521, + "step": 3125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 958.3192138671875, + "completions/mean_terminated_length": 770.0497436523438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6661339299984018, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1201213837583874, + "kl": 0.030059814453125, + "learning_rate": 3.7297164430992244e-07, + "loss": 0.0512, + "num_tokens": 1731442603.0, + "reward": 2.4229912757873535, + "reward_std": 0.43238067626953125, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.13915446400642395, + "step": 3126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 940.5022583007812, + "completions/mean_terminated_length": 762.6139526367188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6663470246657077, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1322184789702175, + "kl": 0.027862548828125, + "learning_rate": 3.726639198763124e-07, + "loss": 0.0685, + "num_tokens": 1731931228.0, + "reward": 2.44140625, + "reward_std": 0.37991076707839966, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.1057254895567894, + "step": 3127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1057.779052734375, + "completions/mean_terminated_length": 825.9091186523438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6665601193330137, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12688346931750136, + "kl": 0.025360107421875, + "learning_rate": 3.723562935846489e-07, + "loss": 0.066, + "num_tokens": 1732472361.0, + "reward": 2.3515625, + "reward_std": 0.47722697257995605, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.1648755669593811, + "step": 3128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 985.1563110351562, + "completions/mean_terminated_length": 788.3333129882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6667732140003196, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14010179889831353, + "kl": 0.02728271484375, + "learning_rate": 3.720487656051793e-07, + "loss": 0.0995, + "num_tokens": 1732989583.0, + "reward": 2.4229912757873535, + "reward_std": 0.48437464237213135, + "rewards/accuracy_reward/mean": 0.5717592835426331, + "rewards/accuracy_reward/std": 0.49539753794670105, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15964375436306, + "step": 3129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1052.399658203125, + "completions/mean_terminated_length": 871.1425170898438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6669863086676255, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1249553346825921, + "kl": 0.025146484375, + "learning_rate": 3.71741336108097e-07, + "loss": 0.0983, + "num_tokens": 1733530466.0, + "reward": 2.5011162757873535, + "reward_std": 0.43405023217201233, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13810986280441284, + "step": 3130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 983.872802734375, + "completions/mean_terminated_length": 806.5182495117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6671994033349316, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1252057133838651, + "kl": 0.027740478515625, + "learning_rate": 3.7143400526354065e-07, + "loss": 0.0961, + "num_tokens": 1734036121.0, + "reward": 2.501674175262451, + "reward_std": 0.4200673997402191, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.16628827154636383, + "step": 3131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1085.8192138671875, + "completions/mean_terminated_length": 867.0219116210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6674124980022375, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12501167367196567, + "kl": 0.026641845703125, + "learning_rate": 3.7112677324159424e-07, + "loss": 0.0421, + "num_tokens": 1734597944.0, + "reward": 2.39453125, + "reward_std": 0.5232722759246826, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.16374634206295013, + "step": 3132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1002.966552734375, + "completions/mean_terminated_length": 812.7097778320312, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6676255926695435, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12504874351742548, + "kl": 0.02734375, + "learning_rate": 3.708196402122875e-07, + "loss": 0.0932, + "num_tokens": 1735126825.0, + "reward": 2.407924175262451, + "reward_std": 0.39183828234672546, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12651757895946503, + "step": 3133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 970.0201416015625, + "completions/mean_terminated_length": 753.2681274414062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6678386873368494, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14446529017385498, + "kl": 0.028900146484375, + "learning_rate": 3.7051260634559445e-07, + "loss": 0.0893, + "num_tokens": 1735629394.0, + "reward": 2.4603796005249023, + "reward_std": 0.40502676367759705, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12450841069221497, + "step": 3134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1057.71875, + "completions/mean_terminated_length": 794.7626953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6680517820041554, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1137649200213167, + "kl": 0.02490234375, + "learning_rate": 3.702056718114355e-07, + "loss": 0.0343, + "num_tokens": 1736174900.0, + "reward": 2.4291296005249023, + "reward_std": 0.3963313400745392, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14052370190620422, + "step": 3135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1019.2991333007812, + "completions/mean_terminated_length": 735.0142211914062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6682648766714613, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13589099142745445, + "kl": 0.0264892578125, + "learning_rate": 3.6989883677967483e-07, + "loss": 0.0849, + "num_tokens": 1736701914.0, + "reward": 2.2935268878936768, + "reward_std": 0.4701041281223297, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.17498444020748138, + "step": 3136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1011.7902221679688, + "completions/mean_terminated_length": 816.6419067382812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6684779713387673, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12305038708272101, + "kl": 0.02642822265625, + "learning_rate": 3.6959210142012274e-07, + "loss": 0.0551, + "num_tokens": 1737222876.0, + "reward": 2.4609375, + "reward_std": 0.4333146810531616, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15571676194667816, + "step": 3137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 939.62060546875, + "completions/mean_terminated_length": 709.5795288085938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6686910660060732, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11888697442220018, + "kl": 0.029144287109375, + "learning_rate": 3.692854659025334e-07, + "loss": 0.0698, + "num_tokens": 1737706978.0, + "reward": 2.4927456378936768, + "reward_std": 0.4107138514518738, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13416090607643127, + "step": 3138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 966.24560546875, + "completions/mean_terminated_length": 755.6640014648438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6689041606733791, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11980726782629948, + "kl": 0.02886962890625, + "learning_rate": 3.6897893039660597e-07, + "loss": 0.075, + "num_tokens": 1738204160.0, + "reward": 2.3950893878936768, + "reward_std": 0.40266507863998413, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15090012550354004, + "step": 3139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 933.0335083007812, + "completions/mean_terminated_length": 750.5844116210938, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6691172553406851, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13563677144536043, + "kl": 0.03057861328125, + "learning_rate": 3.6867249507198486e-07, + "loss": 0.1168, + "num_tokens": 1738681951.0, + "reward": 2.4073662757873535, + "reward_std": 0.4084078073501587, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1542993038892746, + "step": 3140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 883.80810546875, + "completions/mean_terminated_length": 710.6718139648438, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.669330350007991, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.24132016109008478, + "kl": 0.03216552734375, + "learning_rate": 3.6836616009825805e-07, + "loss": 0.0461, + "num_tokens": 1739153625.0, + "reward": 2.572544813156128, + "reward_std": 0.3973178565502167, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166749835014343, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.2463276982307434, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13033421337604523, + "step": 3141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 848.0223388671875, + "completions/mean_terminated_length": 733.5990600585938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.669543444675297, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13881914498148673, + "kl": 0.0347900390625, + "learning_rate": 3.680599256449589e-07, + "loss": 0.0371, + "num_tokens": 1739608083.0, + "reward": 2.4988839626312256, + "reward_std": 0.3701789081096649, + "rewards/accuracy_reward/mean": 0.5763888955116272, + "rewards/accuracy_reward/std": 0.4947032034397125, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174960792064667, + "rewards/tag_count_reward/mean": 0.9854910969734192, + "rewards/tag_count_reward/std": 0.09495897591114044, + "step": 3142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 967.4107666015625, + "completions/mean_terminated_length": 770.6807861328125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.6697565393426029, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1563214311190711, + "kl": 0.02960205078125, + "learning_rate": 3.677537918815646e-07, + "loss": 0.0865, + "num_tokens": 1740113691.0, + "reward": 2.3521206378936768, + "reward_std": 0.4870644509792328, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.14974473416805267, + "step": 3143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 980.6339721679688, + "completions/mean_terminated_length": 769.5160522460938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6699696340099089, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12406385365521579, + "kl": 0.02862548828125, + "learning_rate": 3.674477589774966e-07, + "loss": 0.0743, + "num_tokens": 1740628919.0, + "reward": 2.3872768878936768, + "reward_std": 0.36976245045661926, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13763901591300964, + "step": 3144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1048.821533203125, + "completions/mean_terminated_length": 854.3146362304688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.6701827286772148, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12198801427599272, + "kl": 0.023773193359375, + "learning_rate": 3.6714182710212094e-07, + "loss": 0.046, + "num_tokens": 1741180087.0, + "reward": 2.450892925262451, + "reward_std": 0.4436436593532562, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11162012070417404, + "step": 3145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 962.7120971679688, + "completions/mean_terminated_length": 708.581298828125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6703958233445209, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1450248931589689, + "kl": 0.029296875, + "learning_rate": 3.6683599642474716e-07, + "loss": 0.1163, + "num_tokens": 1741679782.0, + "reward": 2.3521206378936768, + "reward_std": 0.5184926390647888, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067809045314789, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16142748296260834, + "step": 3146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 895.029052734375, + "completions/mean_terminated_length": 759.892822265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6706089180118268, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1474702976756312, + "kl": 0.03076171875, + "learning_rate": 3.6653026711462966e-07, + "loss": 0.1044, + "num_tokens": 1742148227.0, + "reward": 2.4375, + "reward_std": 0.47774118185043335, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.152545765042305, + "step": 3147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1013.05810546875, + "completions/mean_terminated_length": 798.2587280273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6708220126791327, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12896039261732112, + "kl": 0.02606201171875, + "learning_rate": 3.662246393409657e-07, + "loss": 0.0854, + "num_tokens": 1742668861.0, + "reward": 2.3744421005249023, + "reward_std": 0.38497596979141235, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1208655834197998, + "step": 3148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1095.919677734375, + "completions/mean_terminated_length": 825.8452758789062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6710351073464387, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12341173822859448, + "kl": 0.024566650390625, + "learning_rate": 3.65919113272897e-07, + "loss": 0.1024, + "num_tokens": 1743230521.0, + "reward": 2.2862725257873535, + "reward_std": 0.48895934224128723, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.170974463224411, + "step": 3149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1039.3125, + "completions/mean_terminated_length": 789.2479248046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6712482020137446, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12922827712692214, + "kl": 0.025390625, + "learning_rate": 3.656136890795092e-07, + "loss": 0.0878, + "num_tokens": 1743770037.0, + "reward": 2.3666296005249023, + "reward_std": 0.43791985511779785, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14201714098453522, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1064.779052734375, + "completions/mean_terminated_length": 803.69775390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6714612966810506, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13503756607025763, + "kl": 0.026947021484375, + "learning_rate": 3.6530836692983056e-07, + "loss": 0.0639, + "num_tokens": 1744310994.0, + "reward": 2.390625, + "reward_std": 0.40812820196151733, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14816173911094666, + "step": 3151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 890.3906860351562, + "completions/mean_terminated_length": 751.4774780273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6716743913483565, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1255107174827235, + "kl": 0.030853271484375, + "learning_rate": 3.650031469928342e-07, + "loss": 0.0595, + "num_tokens": 1744779201.0, + "reward": 2.4620537757873535, + "reward_std": 0.43585923314094543, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1457662582397461, + "step": 3152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1068.477783203125, + "completions/mean_terminated_length": 858.7696533203125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.6718874860156625, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11058912215797607, + "kl": 0.02569580078125, + "learning_rate": 3.646980294374354e-07, + "loss": 0.0462, + "num_tokens": 1745332199.0, + "reward": 2.4810268878936768, + "reward_std": 0.40336552262306213, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.10171286761760712, + "step": 3153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1075.341552734375, + "completions/mean_terminated_length": 840.9334716796875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6721005806829684, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11722160299189707, + "kl": 0.025604248046875, + "learning_rate": 3.6439301443249393e-07, + "loss": 0.1001, + "num_tokens": 1745879824.0, + "reward": 2.3699777126312256, + "reward_std": 0.5322286486625671, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.1867084950208664, + "step": 3154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 974.0178833007812, + "completions/mean_terminated_length": 736.9808959960938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6723136753502743, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14085428645184883, + "kl": 0.028076171875, + "learning_rate": 3.6408810214681185e-07, + "loss": 0.0627, + "num_tokens": 1746383064.0, + "reward": 2.408482313156128, + "reward_std": 0.3791973292827606, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13989263772964478, + "step": 3155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 988.1094360351562, + "completions/mean_terminated_length": 788.5013427734375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6725267700175803, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14533547741062058, + "kl": 0.0264892578125, + "learning_rate": 3.6378329274913475e-07, + "loss": 0.0914, + "num_tokens": 1746892025.0, + "reward": 2.424107313156128, + "reward_std": 0.43542560935020447, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12083587795495987, + "step": 3156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 969.69873046875, + "completions/mean_terminated_length": 783.3953247070312, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6727398646848862, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13387323037041368, + "kl": 0.02618408203125, + "learning_rate": 3.6347858640815175e-07, + "loss": 0.1273, + "num_tokens": 1747393426.0, + "reward": 2.4447546005249023, + "reward_std": 0.4634547531604767, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.14248228073120117, + "step": 3157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 901.4464721679688, + "completions/mean_terminated_length": 767.0623779296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6729529593521922, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14417524839413087, + "kl": 0.029449462890625, + "learning_rate": 3.631739832924938e-07, + "loss": 0.0881, + "num_tokens": 1747864442.0, + "reward": 2.4737725257873535, + "reward_std": 0.40823379158973694, + "rewards/accuracy_reward/mean": 0.5925925970077515, + "rewards/accuracy_reward/std": 0.49192148447036743, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14827017486095428, + "step": 3158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1046.4754638671875, + "completions/mean_terminated_length": 835.34326171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6731660540194981, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12140019305384736, + "kl": 0.0250244140625, + "learning_rate": 3.62869483570736e-07, + "loss": 0.0704, + "num_tokens": 1748403871.0, + "reward": 2.3136162757873535, + "reward_std": 0.4314676821231842, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.10711704939603806, + "step": 3159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 907.0089721679688, + "completions/mean_terminated_length": 723.7409057617188, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.6733791486868042, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.135318506368615, + "kl": 0.029510498046875, + "learning_rate": 3.6256508741139555e-07, + "loss": 0.0715, + "num_tokens": 1748881683.0, + "reward": 2.4832589626312256, + "reward_std": 0.45923757553100586, + "rewards/accuracy_reward/mean": 0.5972222089767456, + "rewards/accuracy_reward/std": 0.4910254180431366, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14886364340782166, + "step": 3160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1196.1138916015625, + "completions/mean_terminated_length": 894.9939575195312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.67359224335411, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13121319126883563, + "kl": 0.02392578125, + "learning_rate": 3.6226079498293206e-07, + "loss": 0.116, + "num_tokens": 1749484054.0, + "reward": 2.1941964626312256, + "reward_std": 0.5945513844490051, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.8660714030265808, + "rewards/format_reward/std": 0.34095630049705505, + "rewards/tag_count_reward/mean": 0.9352678656578064, + "rewards/tag_count_reward/std": 0.2121729850769043, + "step": 3161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1026.76123046875, + "completions/mean_terminated_length": 804.7527465820312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6738053380214161, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15588622314227032, + "kl": 0.026519775390625, + "learning_rate": 3.6195660645374837e-07, + "loss": 0.1051, + "num_tokens": 1750014267.0, + "reward": 2.3409600257873535, + "reward_std": 0.48787736892700195, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067809045314789, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15030226111412048, + "step": 3162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 945.46435546875, + "completions/mean_terminated_length": 754.9738159179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.674018432688722, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13211332699650077, + "kl": 0.027587890625, + "learning_rate": 3.6165252199218967e-07, + "loss": 0.0843, + "num_tokens": 1750516347.0, + "reward": 2.4927456378936768, + "reward_std": 0.39208781719207764, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.11001905798912048, + "step": 3163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1072.0826416015625, + "completions/mean_terminated_length": 830.1420288085938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6742315273560279, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14208125442396127, + "kl": 0.024261474609375, + "learning_rate": 3.6134854176654316e-07, + "loss": 0.116, + "num_tokens": 1751065664.0, + "reward": 2.3621652126312256, + "reward_std": 0.48551085591316223, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.17739619314670563, + "step": 3164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 996.4442138671875, + "completions/mean_terminated_length": 801.7116088867188, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.6744446220233339, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1304505816535144, + "kl": 0.027862548828125, + "learning_rate": 3.6104466594503867e-07, + "loss": 0.0629, + "num_tokens": 1751578791.0, + "reward": 2.4732143878936768, + "reward_std": 0.45345214009284973, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1540280133485794, + "step": 3165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 958.94873046875, + "completions/mean_terminated_length": 764.0657958984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6746577166906398, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13166762557239953, + "kl": 0.02728271484375, + "learning_rate": 3.607408946958486e-07, + "loss": 0.0758, + "num_tokens": 1752087312.0, + "reward": 2.55859375, + "reward_std": 0.381770521402359, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9871651530265808, + "rewards/tag_count_reward/std": 0.08346119523048401, + "step": 3166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1098.232177734375, + "completions/mean_terminated_length": 888.6103515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6748708113579458, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1878031091802857, + "kl": 0.026275634765625, + "learning_rate": 3.6043722818708646e-07, + "loss": 0.0801, + "num_tokens": 1752657720.0, + "reward": 2.375, + "reward_std": 0.439854234457016, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14721500873565674, + "step": 3167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 957.716552734375, + "completions/mean_terminated_length": 755.8121337890625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6750839060252517, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13998441277369844, + "kl": 0.0269775390625, + "learning_rate": 3.6013366658680864e-07, + "loss": 0.1062, + "num_tokens": 1753149769.0, + "reward": 2.4029018878936768, + "reward_std": 0.4386986792087555, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005797147750854, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12585102021694183, + "step": 3168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1056.57373046875, + "completions/mean_terminated_length": 821.0414428710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6752970006925577, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14049653958372124, + "kl": 0.0272216796875, + "learning_rate": 3.598302100630135e-07, + "loss": 0.1148, + "num_tokens": 1753686682.0, + "reward": 2.4146206378936768, + "reward_std": 0.4072682559490204, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.14072787761688232, + "step": 3169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 937.9420166015625, + "completions/mean_terminated_length": 766.2835083007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6755100953598636, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1245964897125346, + "kl": 0.028106689453125, + "learning_rate": 3.59526858783641e-07, + "loss": 0.0587, + "num_tokens": 1754174752.0, + "reward": 2.4246652126312256, + "reward_std": 0.47704753279685974, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14052370190620422, + "step": 3170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1004.513427734375, + "completions/mean_terminated_length": 811.2750854492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6757231900271695, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12159456367258138, + "kl": 0.027984619140625, + "learning_rate": 3.5922361291657243e-07, + "loss": 0.054, + "num_tokens": 1754702022.0, + "reward": 2.349888563156128, + "reward_std": 0.36837661266326904, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 3171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 931.1964721679688, + "completions/mean_terminated_length": 761.8097534179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6759362846944755, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12209497820257305, + "kl": 0.030975341796875, + "learning_rate": 3.589204726296317e-07, + "loss": 0.0491, + "num_tokens": 1755187726.0, + "reward": 2.4676339626312256, + "reward_std": 0.43713292479515076, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13661938905715942, + "step": 3172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 924.8281860351562, + "completions/mean_terminated_length": 796.3059692382812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6761493793617814, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14193845854589848, + "kl": 0.03076171875, + "learning_rate": 3.5861743809058296e-07, + "loss": 0.104, + "num_tokens": 1755668721.0, + "reward": 2.5558037757873535, + "reward_std": 0.4540950357913971, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11119430512189865, + "step": 3173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1044.6317138671875, + "completions/mean_terminated_length": 829.8184204101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6763624740290874, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12957226237998298, + "kl": 0.02691650390625, + "learning_rate": 3.5831450946713373e-07, + "loss": 0.0948, + "num_tokens": 1756207036.0, + "reward": 2.3627233505249023, + "reward_std": 0.44290974736213684, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.155877023935318, + "step": 3174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 969.5469360351562, + "completions/mean_terminated_length": 773.205810546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6765755686963933, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11111187066789892, + "kl": 0.02490234375, + "learning_rate": 3.5801168692693085e-07, + "loss": 0.0276, + "num_tokens": 1756714913.0, + "reward": 2.474888563156128, + "reward_std": 0.4251917004585266, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.1125321090221405, + "step": 3175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1076.325927734375, + "completions/mean_terminated_length": 756.2789306640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6767886633636994, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12143813758747042, + "kl": 0.023345947265625, + "learning_rate": 3.57708970637564e-07, + "loss": 0.0548, + "num_tokens": 1757270451.0, + "reward": 2.2036831378936768, + "reward_std": 0.39231953024864197, + "rewards/accuracy_reward/mean": 0.2790178656578064, + "rewards/accuracy_reward/std": 0.449017733335495, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1506175547838211, + "step": 3176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 941.2701416015625, + "completions/mean_terminated_length": 736.320068359375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6770017580310053, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1363581152598531, + "kl": 0.030059814453125, + "learning_rate": 3.574063607665633e-07, + "loss": 0.1031, + "num_tokens": 1757755084.0, + "reward": 2.5167412757873535, + "reward_std": 0.522887647151947, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.17114688456058502, + "step": 3177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 909.9420166015625, + "completions/mean_terminated_length": 766.9698486328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6772148526983113, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12932242145500158, + "kl": 0.03070068359375, + "learning_rate": 3.5710385748140006e-07, + "loss": 0.1044, + "num_tokens": 1758233634.0, + "reward": 2.572544813156128, + "reward_std": 0.4777713119983673, + "rewards/accuracy_reward/mean": 0.6964285969734192, + "rewards/accuracy_reward/std": 0.4603137671947479, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1717585176229477, + "step": 3178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 989.3995971679688, + "completions/mean_terminated_length": 809.7415161132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6774279473656172, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12157252632002181, + "kl": 0.02471923828125, + "learning_rate": 3.568014609494867e-07, + "loss": 0.0454, + "num_tokens": 1758748789.0, + "reward": 2.4073662757873535, + "reward_std": 0.4419124126434326, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.14308229088783264, + "step": 3179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1952.0, + "completions/mean_length": 901.154052734375, + "completions/mean_terminated_length": 727.2108154296875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6776410420329231, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.131107924792799, + "kl": 0.031494140625, + "learning_rate": 3.5649917133817653e-07, + "loss": 0.0688, + "num_tokens": 1759219274.0, + "reward": 2.4419643878936768, + "reward_std": 0.4218176603317261, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13636787235736847, + "step": 3180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 965.7120971679688, + "completions/mean_terminated_length": 755.0266723632812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6778541367002291, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13279159014723266, + "kl": 0.025054931640625, + "learning_rate": 3.56196988814764e-07, + "loss": 0.065, + "num_tokens": 1759720745.0, + "reward": 2.3839287757873535, + "reward_std": 0.38550058007240295, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10779669135808945, + "step": 3181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 958.7410888671875, + "completions/mean_terminated_length": 780.4986572265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.678067231367535, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12068206797649661, + "kl": 0.028717041015625, + "learning_rate": 3.558949135464837e-07, + "loss": 0.099, + "num_tokens": 1760216677.0, + "reward": 2.3565850257873535, + "reward_std": 0.41873735189437866, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1091761365532875, + "step": 3182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1044.07373046875, + "completions/mean_terminated_length": 808.9945068359375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.678280326034841, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13654134308314955, + "kl": 0.02484130859375, + "learning_rate": 3.5559294570051135e-07, + "loss": 0.0614, + "num_tokens": 1760757078.0, + "reward": 2.3113839626312256, + "reward_std": 0.3459184765815735, + "rewards/accuracy_reward/mean": 0.4027777910232544, + "rewards/accuracy_reward/std": 0.4910254180431366, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11632467061281204, + "step": 3183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 875.8482666015625, + "completions/mean_terminated_length": 738.4638671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6784934207021469, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.3916316059081557, + "kl": 0.0340576171875, + "learning_rate": 3.55291085443963e-07, + "loss": 0.0822, + "num_tokens": 1761227714.0, + "reward": 2.509486675262451, + "reward_std": 0.3954867720603943, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1477556824684143, + "step": 3184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 986.4910888671875, + "completions/mean_terminated_length": 741.5274658203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6787065153694529, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.18603528859447718, + "kl": 0.02691650390625, + "learning_rate": 3.549893329438951e-07, + "loss": 0.0892, + "num_tokens": 1761742926.0, + "reward": 2.3856027126312256, + "reward_std": 0.39618343114852905, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11338312178850174, + "step": 3185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1019.46435546875, + "completions/mean_terminated_length": 789.0272827148438, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6789196100367588, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13418443469941863, + "kl": 0.02874755859375, + "learning_rate": 3.5468768836730465e-07, + "loss": 0.0638, + "num_tokens": 1762261694.0, + "reward": 2.37109375, + "reward_std": 0.3599708378314972, + "rewards/accuracy_reward/mean": 0.4722222089767456, + "rewards/accuracy_reward/std": 0.49980661273002625, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12338031083345413, + "step": 3186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 989.40185546875, + "completions/mean_terminated_length": 806.5026245117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6791327047040647, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13494690325357142, + "kl": 0.026580810546875, + "learning_rate": 3.543861518811286e-07, + "loss": 0.0688, + "num_tokens": 1762775586.0, + "reward": 2.4497768878936768, + "reward_std": 0.45355990529060364, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13709373772144318, + "step": 3187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 951.2567138671875, + "completions/mean_terminated_length": 727.1908569335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6793457993713707, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12377140203959655, + "kl": 0.026611328125, + "learning_rate": 3.5408472365224474e-07, + "loss": 0.0851, + "num_tokens": 1763270677.0, + "reward": 2.3934152126312256, + "reward_std": 0.36327415704727173, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12892211973667145, + "step": 3188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 827.3080444335938, + "completions/mean_terminated_length": 701.029541015625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6795588940386766, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12787719298349856, + "kl": 0.03350830078125, + "learning_rate": 3.5378340384747027e-07, + "loss": 0.0395, + "num_tokens": 1763711391.0, + "reward": 2.65234375, + "reward_std": 0.3463893234729767, + "rewards/accuracy_reward/mean": 0.7209821343421936, + "rewards/accuracy_reward/std": 0.449017733335495, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10092891752719879, + "step": 3189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 925.9397583007812, + "completions/mean_terminated_length": 749.0775146484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6797719887059827, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.146702027068851, + "kl": 0.030303955078125, + "learning_rate": 3.534821926335627e-07, + "loss": 0.1066, + "num_tokens": 1764191220.0, + "reward": 2.5050225257873535, + "reward_std": 0.41286221146583557, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15423759818077087, + "step": 3190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 971.38623046875, + "completions/mean_terminated_length": 772.01318359375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6799850833732886, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14519756827273508, + "kl": 0.027008056640625, + "learning_rate": 3.5318109017721933e-07, + "loss": 0.0547, + "num_tokens": 1764696881.0, + "reward": 2.5145089626312256, + "reward_std": 0.3931633532047272, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1312885582447052, + "step": 3191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 885.2969360351562, + "completions/mean_terminated_length": 708.9486083984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6801981780405946, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.22048546292659177, + "kl": 0.063262939453125, + "learning_rate": 3.528800966450771e-07, + "loss": 0.1137, + "num_tokens": 1765163686.0, + "reward": 2.509486675262451, + "reward_std": 0.37478265166282654, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.09911917895078659, + "step": 3192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1027.805908203125, + "completions/mean_terminated_length": 812.7378540039062, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.6804112727079005, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12321496379054767, + "kl": 0.0252685546875, + "learning_rate": 3.5257921220371365e-07, + "loss": 0.0804, + "num_tokens": 1765692143.0, + "reward": 2.443638563156128, + "reward_std": 0.39287397265434265, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11780035495758057, + "step": 3193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 987.1116333007812, + "completions/mean_terminated_length": 763.4649047851562, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6806243673752065, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12586450994243412, + "kl": 0.02716064453125, + "learning_rate": 3.522784370196444e-07, + "loss": 0.0677, + "num_tokens": 1766204465.0, + "reward": 2.4732143878936768, + "reward_std": 0.390165776014328, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12383605539798737, + "step": 3194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 950.8951416015625, + "completions/mean_terminated_length": 781.2396850585938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6808374620425124, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13326212776539978, + "kl": 0.0296630859375, + "learning_rate": 3.5197777125932636e-07, + "loss": 0.0837, + "num_tokens": 1766694898.0, + "reward": 2.4771206378936768, + "reward_std": 0.3968465030193329, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15578390657901764, + "step": 3195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1104.6920166015625, + "completions/mean_terminated_length": 847.4261474609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6810505567098183, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11076158700780805, + "kl": 0.023529052734375, + "learning_rate": 3.516772150891545e-07, + "loss": 0.1215, + "num_tokens": 1767261656.0, + "reward": 2.342076063156128, + "reward_std": 0.5290426015853882, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16933098435401917, + "step": 3196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1049.5067138671875, + "completions/mean_terminated_length": 861.4615478515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6812636513771243, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15210716919171763, + "kl": 0.02923583984375, + "learning_rate": 3.513767686754638e-07, + "loss": 0.0655, + "num_tokens": 1767798539.0, + "reward": 2.4107143878936768, + "reward_std": 0.4396968483924866, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12951265275478363, + "step": 3197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1040.5201416015625, + "completions/mean_terminated_length": 728.26025390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6814767460444302, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12789386381105736, + "kl": 0.025634765625, + "learning_rate": 3.510764321845283e-07, + "loss": 0.0746, + "num_tokens": 1768332628.0, + "reward": 2.3392858505249023, + "reward_std": 0.3575405776500702, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.4966535270214081, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12270178645849228, + "step": 3198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 927.6317138671875, + "completions/mean_terminated_length": 730.611572265625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6816898407117362, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13260031680391898, + "kl": 0.02862548828125, + "learning_rate": 3.5077620578256116e-07, + "loss": 0.0483, + "num_tokens": 1768820783.0, + "reward": 2.423549175262451, + "reward_std": 0.43929746747016907, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.17819663882255554, + "step": 3199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1029.430908203125, + "completions/mean_terminated_length": 827.895751953125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.6819029353790421, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11829040646104894, + "kl": 0.0252685546875, + "learning_rate": 3.5047608963571517e-07, + "loss": 0.0304, + "num_tokens": 1769352944.0, + "reward": 2.4051339626312256, + "reward_std": 0.35791248083114624, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11345604062080383, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1051.4732666015625, + "completions/mean_terminated_length": 834.8369750976562, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6821160300463481, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11946690477040675, + "kl": 0.0252685546875, + "learning_rate": 3.501760839100809e-07, + "loss": 0.0345, + "num_tokens": 1769893252.0, + "reward": 2.4034600257873535, + "reward_std": 0.4048386812210083, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 3201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1037.009033203125, + "completions/mean_terminated_length": 827.1806030273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.682329124713654, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.126176189986573, + "kl": 0.0262451171875, + "learning_rate": 3.498761887716892e-07, + "loss": 0.0803, + "num_tokens": 1770426136.0, + "reward": 2.3582589626312256, + "reward_std": 0.4287581741809845, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.15854518115520477, + "step": 3202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 937.2031860351562, + "completions/mean_terminated_length": 765.4303588867188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.68254221938096, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12766465159262708, + "kl": 0.02813720703125, + "learning_rate": 3.495764043865088e-07, + "loss": 0.0819, + "num_tokens": 1770915731.0, + "reward": 2.4933037757873535, + "reward_std": 0.3616020083427429, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.1090860590338707, + "step": 3203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1004.8147583007812, + "completions/mean_terminated_length": 814.8944702148438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.682755314048266, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12618881296051956, + "kl": 0.026397705078125, + "learning_rate": 3.4927673092044753e-07, + "loss": 0.0692, + "num_tokens": 1771436288.0, + "reward": 2.575892925262451, + "reward_std": 0.4124738276004791, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764313220977783, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11611521244049072, + "step": 3204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 902.49560546875, + "completions/mean_terminated_length": 728.7557983398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6829684087155719, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12493146058500157, + "kl": 0.03143310546875, + "learning_rate": 3.4897716853935166e-07, + "loss": 0.0256, + "num_tokens": 1771913310.0, + "reward": 2.626674175262451, + "reward_std": 0.37074634432792664, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.45739173889160156, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1293378323316574, + "step": 3205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 968.1741333007812, + "completions/mean_terminated_length": 807.5846557617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6831815033828779, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12342937569767037, + "kl": 0.026214599609375, + "learning_rate": 3.4867771740900574e-07, + "loss": 0.0733, + "num_tokens": 1772417052.0, + "reward": 2.5396206378936768, + "reward_std": 0.41366440057754517, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10744724422693253, + "step": 3206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 919.6160888671875, + "completions/mean_terminated_length": 761.6997680664062, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6833945980501838, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12483192138286489, + "kl": 0.030303955078125, + "learning_rate": 3.4837837769513356e-07, + "loss": 0.0759, + "num_tokens": 1772897728.0, + "reward": 2.4921875, + "reward_std": 0.4152313768863678, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.14495466649532318, + "step": 3207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 899.44873046875, + "completions/mean_terminated_length": 755.1582641601562, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.6836076927174898, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12199326708025983, + "kl": 0.0306396484375, + "learning_rate": 3.4807914956339667e-07, + "loss": 0.1006, + "num_tokens": 1773370281.0, + "reward": 2.5167412757873535, + "reward_std": 0.45422816276550293, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1361067146062851, + "step": 3208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1012.6719360351562, + "completions/mean_terminated_length": 801.1531982421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6838207873847957, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.21942022406132486, + "kl": 0.02984619140625, + "learning_rate": 3.477800331793944e-07, + "loss": 0.0623, + "num_tokens": 1773901334.0, + "reward": 2.4324777126312256, + "reward_std": 0.4031684994697571, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1276179552078247, + "step": 3209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 898.8147583007812, + "completions/mean_terminated_length": 767.31591796875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6840338820521017, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12885378471931858, + "kl": 0.032135009765625, + "learning_rate": 3.4748102870866536e-07, + "loss": 0.0898, + "num_tokens": 1774365251.0, + "reward": 2.529576063156128, + "reward_std": 0.4196571707725525, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10681798309087753, + "step": 3210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1020.7500610351562, + "completions/mean_terminated_length": 807.5471801757812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6842469767194076, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1366695324226509, + "kl": 0.029052734375, + "learning_rate": 3.471821363166854e-07, + "loss": 0.0811, + "num_tokens": 1774890339.0, + "reward": 2.446986675262451, + "reward_std": 0.3788634240627289, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.1770157665014267, + "step": 3211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1157.578125, + "completions/mean_terminated_length": 885.0000610351562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6844600713867135, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15311054874881494, + "kl": 0.02508544921875, + "learning_rate": 3.4688335616886866e-07, + "loss": 0.0879, + "num_tokens": 1775488022.0, + "reward": 2.3275671005249023, + "reward_std": 0.4805065393447876, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1640053689479828, + "step": 3212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 812.6451416015625, + "completions/mean_terminated_length": 707.9540405273438, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.6846731660540195, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14260398135626523, + "kl": 0.032989501953125, + "learning_rate": 3.465846884305668e-07, + "loss": 0.0861, + "num_tokens": 1775916663.0, + "reward": 2.5708706378936768, + "reward_std": 0.45875486731529236, + "rewards/accuracy_reward/mean": 0.71875, + "rewards/accuracy_reward/std": 0.45011183619499207, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.14974473416805267, + "step": 3213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1144.140625, + "completions/mean_terminated_length": 867.4490356445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6848862607213254, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12313410358365717, + "kl": 0.024871826171875, + "learning_rate": 3.462861332670699e-07, + "loss": 0.0492, + "num_tokens": 1776500470.0, + "reward": 2.416294813156128, + "reward_std": 0.4591118395328522, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13559210300445557, + "step": 3214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 942.4420166015625, + "completions/mean_terminated_length": 794.1012573242188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6850993553886314, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11684252435939976, + "kl": 0.026336669921875, + "learning_rate": 3.4598769084360535e-07, + "loss": 0.0344, + "num_tokens": 1776993468.0, + "reward": 2.4168527126312256, + "reward_std": 0.3923453688621521, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12359261512756348, + "step": 3215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 842.3795166015625, + "completions/mean_terminated_length": 717.6600952148438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6853124500559373, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14330222678984342, + "kl": 0.033660888671875, + "learning_rate": 3.456893613253381e-07, + "loss": 0.0646, + "num_tokens": 1777442342.0, + "reward": 2.4810268878936768, + "reward_std": 0.47261562943458557, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15391045808792114, + "step": 3216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1125.04248046875, + "completions/mean_terminated_length": 869.9800415039062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6855255447232433, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11719815097562283, + "kl": 0.023406982421875, + "learning_rate": 3.453911448773707e-07, + "loss": 0.0733, + "num_tokens": 1778027737.0, + "reward": 2.23828125, + "reward_std": 0.4467274248600006, + "rewards/accuracy_reward/mean": 0.3370535671710968, + "rewards/accuracy_reward/std": 0.4732317626476288, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.150824636220932, + "step": 3217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1037.3482666015625, + "completions/mean_terminated_length": 779.7311401367188, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6857386393905492, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.10334794752639039, + "kl": 0.025543212890625, + "learning_rate": 3.450930416647429e-07, + "loss": 0.0421, + "num_tokens": 1778558501.0, + "reward": 2.404017925262451, + "reward_std": 0.4223562777042389, + "rewards/accuracy_reward/mean": 0.5324074029922485, + "rewards/accuracy_reward/std": 0.49952712655067444, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1457662582397461, + "step": 3218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1056.2366943359375, + "completions/mean_terminated_length": 824.0054931640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6859517340578553, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1322004363509126, + "kl": 0.02593994140625, + "learning_rate": 3.447950518524327e-07, + "loss": 0.0785, + "num_tokens": 1779095471.0, + "reward": 2.39453125, + "reward_std": 0.46494632959365845, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14372976124286652, + "step": 3219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1089.634033203125, + "completions/mean_terminated_length": 841.96630859375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6861648287251612, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12935176748098937, + "kl": 0.027313232421875, + "learning_rate": 3.44497175605354e-07, + "loss": 0.06, + "num_tokens": 1779650267.0, + "reward": 2.25, + "reward_std": 0.4300120174884796, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1523328423500061, + "step": 3220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1063.055908203125, + "completions/mean_terminated_length": 842.3851928710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6863779233924671, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12782868625535562, + "kl": 0.0262451171875, + "learning_rate": 3.441994130883584e-07, + "loss": 0.091, + "num_tokens": 1780195444.0, + "reward": 2.3214287757873535, + "reward_std": 0.47149014472961426, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15850186347961426, + "step": 3221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 857.0313110351562, + "completions/mean_terminated_length": 697.2304077148438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6865910180597731, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1432209289558248, + "kl": 0.03350830078125, + "learning_rate": 3.439017644662353e-07, + "loss": 0.0625, + "num_tokens": 1780647202.0, + "reward": 2.587611675262451, + "reward_std": 0.35203656554222107, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.4810029864311218, + "rewards/format_reward/mean": 0.9642857313156128, + "rewards/format_reward/std": 0.18578432500362396, + "rewards/tag_count_reward/mean": 0.9849330186843872, + "rewards/tag_count_reward/std": 0.09263478219509125, + "step": 3222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1007.3147583007812, + "completions/mean_terminated_length": 804.7279663085938, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.686804112727079, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12949679308296447, + "kl": 0.028076171875, + "learning_rate": 3.4360422990371006e-07, + "loss": 0.0646, + "num_tokens": 1781170271.0, + "reward": 2.4575893878936768, + "reward_std": 0.4232456386089325, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12270178645849228, + "step": 3223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 987.6652221679688, + "completions/mean_terminated_length": 797.9210815429688, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.687017207394385, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1200548861840904, + "kl": 0.02935791015625, + "learning_rate": 3.4330680956544535e-07, + "loss": 0.049, + "num_tokens": 1781687433.0, + "reward": 2.4481027126312256, + "reward_std": 0.4604281783103943, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13776934146881104, + "step": 3224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1062.3348388671875, + "completions/mean_terminated_length": 882.8865966796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6872303020616909, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12295632693412772, + "kl": 0.026611328125, + "learning_rate": 3.4300950361604023e-07, + "loss": 0.0862, + "num_tokens": 1782231391.0, + "reward": 2.453125, + "reward_std": 0.37440693378448486, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12717820703983307, + "step": 3225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 922.1964721679688, + "completions/mean_terminated_length": 727.6858520507812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6874433967289969, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14504560021625246, + "kl": 0.029144287109375, + "learning_rate": 3.4271231222003137e-07, + "loss": 0.0742, + "num_tokens": 1782718631.0, + "reward": 2.396763563156128, + "reward_std": 0.419214129447937, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13978439569473267, + "step": 3226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1099.1473388671875, + "completions/mean_terminated_length": 843.7903442382812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6876564913963028, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12388396314078683, + "kl": 0.025848388671875, + "learning_rate": 3.424152355418913e-07, + "loss": 0.0875, + "num_tokens": 1783277209.0, + "reward": 2.506138563156128, + "reward_std": 0.49026402831077576, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15488377213478088, + "step": 3227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 942.4553833007812, + "completions/mean_terminated_length": 771.4948120117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6878695860636087, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1342325794874321, + "kl": 0.02734375, + "learning_rate": 3.4211827374602875e-07, + "loss": 0.0628, + "num_tokens": 1783765813.0, + "reward": 2.533482313156128, + "reward_std": 0.4199574589729309, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12198749929666519, + "step": 3228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 991.90185546875, + "completions/mean_terminated_length": 772.7115478515625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.6880826807309147, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12903442656014882, + "kl": 0.027740478515625, + "learning_rate": 3.4182142699678987e-07, + "loss": 0.0831, + "num_tokens": 1784279401.0, + "reward": 2.478794813156128, + "reward_std": 0.3849352300167084, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12585100531578064, + "step": 3229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 930.3839721679688, + "completions/mean_terminated_length": 780.42529296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6882957753982206, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11987131534352367, + "kl": 0.0283203125, + "learning_rate": 3.4152469545845646e-07, + "loss": 0.0859, + "num_tokens": 1784767653.0, + "reward": 2.529576063156128, + "reward_std": 0.35763558745384216, + "rewards/accuracy_reward/mean": 0.6342592835426331, + "rewards/accuracy_reward/std": 0.482195645570755, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.09374666959047318, + "step": 3230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1047.3973388671875, + "completions/mean_terminated_length": 862.1005249023438, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.6885088700655266, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11484671446180356, + "kl": 0.02593994140625, + "learning_rate": 3.412280792952467e-07, + "loss": 0.0905, + "num_tokens": 1785301079.0, + "reward": 2.368861675262451, + "reward_std": 0.4609025716781616, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.14149756729602814, + "step": 3231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1138.4241943359375, + "completions/mean_terminated_length": 863.43603515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6887219647328325, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 1.9967743485306602, + "kl": 0.045989990234375, + "learning_rate": 3.4093157867131483e-07, + "loss": 0.091, + "num_tokens": 1785886837.0, + "reward": 2.2801339626312256, + "reward_std": 0.4846292734146118, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9408482313156128, + "rewards/tag_count_reward/std": 0.20512165129184723, + "step": 3232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1032.57373046875, + "completions/mean_terminated_length": 828.3994750976562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6889350594001385, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13882914908989316, + "kl": 0.02685546875, + "learning_rate": 3.4063519375075123e-07, + "loss": 0.1173, + "num_tokens": 1786413446.0, + "reward": 2.5385046005249023, + "reward_std": 0.43064308166503906, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11561823636293411, + "step": 3233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1056.1116943359375, + "completions/mean_terminated_length": 803.27734375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.6891481540674445, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12581406935550796, + "kl": 0.026611328125, + "learning_rate": 3.4033892469758256e-07, + "loss": 0.0941, + "num_tokens": 1786958120.0, + "reward": 2.467076063156128, + "reward_std": 0.5067455172538757, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16741061210632324, + "step": 3234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1054.4085693359375, + "completions/mean_terminated_length": 831.800537109375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.6893612487347505, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1266715869994, + "kl": 0.026123046875, + "learning_rate": 3.400427716757709e-07, + "loss": 0.0994, + "num_tokens": 1787496735.0, + "reward": 2.3989956378936768, + "reward_std": 0.5269179344177246, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.18444819748401642, + "step": 3235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 888.77685546875, + "completions/mean_terminated_length": 746.416015625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6895743434020564, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13999207710826805, + "kl": 0.031097412109375, + "learning_rate": 3.3974673484921424e-07, + "loss": 0.0757, + "num_tokens": 1787963323.0, + "reward": 2.4994421005249023, + "reward_std": 0.39913371205329895, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14177079498767853, + "step": 3236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 988.77685546875, + "completions/mean_terminated_length": 840.5394287109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6897874380693623, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1296151895488594, + "kl": 0.02618408203125, + "learning_rate": 3.394508143817464e-07, + "loss": 0.0935, + "num_tokens": 1788478071.0, + "reward": 2.4252233505249023, + "reward_std": 0.38937118649482727, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12956565618515015, + "step": 3237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 959.5223388671875, + "completions/mean_terminated_length": 774.7937622070312, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6900005327366683, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12981929883126386, + "kl": 0.02911376953125, + "learning_rate": 3.3915501043713653e-07, + "loss": 0.0661, + "num_tokens": 1788982961.0, + "reward": 2.548549175262451, + "reward_std": 0.4418761134147644, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.13086353242397308, + "step": 3238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 956.4063110351562, + "completions/mean_terminated_length": 764.4461669921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6902136274039742, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1337861017809418, + "kl": 0.02935791015625, + "learning_rate": 3.3885932317908954e-07, + "loss": 0.0421, + "num_tokens": 1789476887.0, + "reward": 2.484375, + "reward_std": 0.4053283929824829, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13583585619926453, + "step": 3239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 903.388427734375, + "completions/mean_terminated_length": 756.3475952148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6904267220712802, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12671813006467553, + "kl": 0.02691650390625, + "learning_rate": 3.3856375277124567e-07, + "loss": 0.0251, + "num_tokens": 1789955605.0, + "reward": 2.540736675262451, + "reward_std": 0.4351884722709656, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.11358113586902618, + "step": 3240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 960.2522583007812, + "completions/mean_terminated_length": 775.6475219726562, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6906398167385861, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1382367425193716, + "kl": 0.028228759765625, + "learning_rate": 3.382682993771807e-07, + "loss": 0.09, + "num_tokens": 1790459990.0, + "reward": 2.419642925262451, + "reward_std": 0.4065806567668915, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12842853367328644, + "step": 3241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 968.9397583007812, + "completions/mean_terminated_length": 798.8553466796875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.6908529114058921, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14050551075254888, + "kl": 0.028839111328125, + "learning_rate": 3.3797296316040533e-07, + "loss": 0.1015, + "num_tokens": 1790969083.0, + "reward": 2.5089287757873535, + "reward_std": 0.46475881338119507, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.12457982450723648, + "step": 3242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1029.7991943359375, + "completions/mean_terminated_length": 794.8297119140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.691066006073198, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10930864713073257, + "kl": 0.0264892578125, + "learning_rate": 3.376777442843656e-07, + "loss": 0.0658, + "num_tokens": 1791499089.0, + "reward": 2.431919813156128, + "reward_std": 0.4017297029495239, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.1088741347193718, + "step": 3243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1032.54248046875, + "completions/mean_terminated_length": 811.790771484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6912791007405039, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1319419259679159, + "kl": 0.024505615234375, + "learning_rate": 3.373826429124425e-07, + "loss": 0.0952, + "num_tokens": 1792034740.0, + "reward": 2.3761162757873535, + "reward_std": 0.42495864629745483, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.146973118185997, + "step": 3244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1028.7388916015625, + "completions/mean_terminated_length": 800.3797607421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6914921954078099, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13124120719865734, + "kl": 0.027130126953125, + "learning_rate": 3.370876592079519e-07, + "loss": 0.0933, + "num_tokens": 1792566735.0, + "reward": 2.4676339626312256, + "reward_std": 0.3682340383529663, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12695714831352234, + "step": 3245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 975.482177734375, + "completions/mean_terminated_length": 752.8840942382812, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.6917052900751158, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1307720831033583, + "kl": 0.0272216796875, + "learning_rate": 3.367927933341453e-07, + "loss": 0.0733, + "num_tokens": 1793072087.0, + "reward": 2.478236675262451, + "reward_std": 0.36143651604652405, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11067523807287216, + "step": 3246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1032.837158203125, + "completions/mean_terminated_length": 831.9759521484375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6919183847424218, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13823756019022623, + "kl": 0.024658203125, + "learning_rate": 3.3649804545420747e-07, + "loss": 0.0801, + "num_tokens": 1793604430.0, + "reward": 2.3950893878936768, + "reward_std": 0.5093016624450684, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15850186347961426, + "step": 3247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1013.0826416015625, + "completions/mean_terminated_length": 827.8869018554688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6921314794097277, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12939242129823833, + "kl": 0.02642822265625, + "learning_rate": 3.3620341573125954e-07, + "loss": 0.0818, + "num_tokens": 1794131747.0, + "reward": 2.3543527126312256, + "reward_std": 0.49827271699905396, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9369419813156128, + "rewards/tag_count_reward/std": 0.20292110741138458, + "step": 3248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 972.919677734375, + "completions/mean_terminated_length": 777.192626953125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.6923445740770338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13638528758844093, + "kl": 0.029876708984375, + "learning_rate": 3.359089043283563e-07, + "loss": 0.1124, + "num_tokens": 1794637071.0, + "reward": 2.466517925262451, + "reward_std": 0.44474413990974426, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13220298290252686, + "step": 3249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 763.6205444335938, + "completions/mean_terminated_length": 637.7009887695312, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6925576687443397, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15979077152984286, + "kl": 0.0335693359375, + "learning_rate": 3.3561451140848723e-07, + "loss": 0.073, + "num_tokens": 1795040165.0, + "reward": 2.5379464626312256, + "reward_std": 0.39149630069732666, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12607400119304657, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1008.9598388671875, + "completions/mean_terminated_length": 779.6348876953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6927707634116457, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13844661475026718, + "kl": 0.027191162109375, + "learning_rate": 3.3532023713457636e-07, + "loss": 0.0708, + "num_tokens": 1795568115.0, + "reward": 2.279017925262451, + "reward_std": 0.404011607170105, + "rewards/accuracy_reward/mean": 0.40509259700775146, + "rewards/accuracy_reward/std": 0.49147912859916687, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15182389318943024, + "step": 3251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1026.97998046875, + "completions/mean_terminated_length": 791.3599243164062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6929838580789516, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13752420654232853, + "kl": 0.02484130859375, + "learning_rate": 3.350260816694816e-07, + "loss": 0.1021, + "num_tokens": 1796103690.0, + "reward": 2.368861675262451, + "reward_std": 0.439092218875885, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14874933660030365, + "step": 3252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 965.0402221679688, + "completions/mean_terminated_length": 729.6141357421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6931969527462575, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13367002497597577, + "kl": 0.029052734375, + "learning_rate": 3.347320451759962e-07, + "loss": 0.0791, + "num_tokens": 1796602748.0, + "reward": 2.4771206378936768, + "reward_std": 0.3418791890144348, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11898136883974075, + "step": 3253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1026.055908203125, + "completions/mean_terminated_length": 779.7700805664062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6934100474135635, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13465163305440037, + "kl": 0.026123046875, + "learning_rate": 3.3443812781684613e-07, + "loss": 0.065, + "num_tokens": 1797137861.0, + "reward": 2.3895089626312256, + "reward_std": 0.5091385841369629, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.17114688456058502, + "step": 3254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1013.6295166015625, + "completions/mean_terminated_length": 795.572998046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6936231420808694, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14682133717412618, + "kl": 0.0302734375, + "learning_rate": 3.341443297546925e-07, + "loss": 0.0997, + "num_tokens": 1797655999.0, + "reward": 2.4754464626312256, + "reward_std": 0.4947318434715271, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.1693466603755951, + "step": 3255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 970.2745971679688, + "completions/mean_terminated_length": 797.1683959960938, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6938362367481754, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14476095030009653, + "kl": 0.027984619140625, + "learning_rate": 3.338506511521301e-07, + "loss": 0.105, + "num_tokens": 1798165946.0, + "reward": 2.3521206378936768, + "reward_std": 0.45912423729896545, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.150824636220932, + "step": 3256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 918.482177734375, + "completions/mean_terminated_length": 723.329833984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6940493314154813, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12726409027995148, + "kl": 0.029693603515625, + "learning_rate": 3.335570921716875e-07, + "loss": 0.0655, + "num_tokens": 1798638306.0, + "reward": 2.513951063156128, + "reward_std": 0.4008297324180603, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11561823636293411, + "step": 3257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 825.7902221679688, + "completions/mean_terminated_length": 679.125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6942624260827873, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15922719171485278, + "kl": 0.030517578125, + "learning_rate": 3.33263652975827e-07, + "loss": 0.078, + "num_tokens": 1799080180.0, + "reward": 2.431919813156128, + "reward_std": 0.3773408830165863, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11987641453742981, + "step": 3258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1050.40185546875, + "completions/mean_terminated_length": 774.7122192382812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6944755207500932, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12576416285989886, + "kl": 0.026702880859375, + "learning_rate": 3.3297033372694473e-07, + "loss": 0.1206, + "num_tokens": 1799625912.0, + "reward": 2.3470983505249023, + "reward_std": 0.45343929529190063, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15964375436306, + "step": 3259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1016.7902221679688, + "completions/mean_terminated_length": 825.8253784179688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6946886154173991, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12008035074118424, + "kl": 0.025604248046875, + "learning_rate": 3.326771345873706e-07, + "loss": 0.0842, + "num_tokens": 1800148218.0, + "reward": 2.364955425262451, + "reward_std": 0.4011920690536499, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407235741615295, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14979995787143707, + "step": 3260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1040.180908203125, + "completions/mean_terminated_length": 800.754150390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6949017100847051, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12962964994846102, + "kl": 0.028900146484375, + "learning_rate": 3.323840557193681e-07, + "loss": 0.094, + "num_tokens": 1800681083.0, + "reward": 2.33984375, + "reward_std": 0.526114284992218, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15764793753623962, + "step": 3261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 964.810302734375, + "completions/mean_terminated_length": 736.462158203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.695114804752011, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13680874924568992, + "kl": 0.02960205078125, + "learning_rate": 3.320910972851333e-07, + "loss": 0.125, + "num_tokens": 1801186694.0, + "reward": 2.3470983505249023, + "reward_std": 0.4431781768798828, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13763901591300964, + "step": 3262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1009.7656860351562, + "completions/mean_terminated_length": 839.8727416992188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.695327899419317, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13551004048073556, + "kl": 0.02764892578125, + "learning_rate": 3.3179825944679683e-07, + "loss": 0.1295, + "num_tokens": 1801710861.0, + "reward": 2.404576063156128, + "reward_std": 0.499032586812973, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1518804132938385, + "step": 3263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1018.6897583007812, + "completions/mean_terminated_length": 805.0592651367188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.695540994086623, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1283599951794352, + "kl": 0.02813720703125, + "learning_rate": 3.315055423664217e-07, + "loss": 0.0614, + "num_tokens": 1802238402.0, + "reward": 2.4090402126312256, + "reward_std": 0.4992900490760803, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1369149088859558, + "step": 3264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 928.1116333007812, + "completions/mean_terminated_length": 761.5641479492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.695754088753929, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1314950921602472, + "kl": 0.0323486328125, + "learning_rate": 3.312129462060048e-07, + "loss": 0.0047, + "num_tokens": 1802721044.0, + "reward": 2.4603796005249023, + "reward_std": 0.35351112484931946, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.09374666959047318, + "step": 3265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1018.9710083007812, + "completions/mean_terminated_length": 778.0137939453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6959671834212349, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11803846573079549, + "kl": 0.02752685546875, + "learning_rate": 3.3092047112747514e-07, + "loss": 0.0638, + "num_tokens": 1803248375.0, + "reward": 2.443638563156128, + "reward_std": 0.38853222131729126, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.11970315873622894, + "step": 3266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 936.8482666015625, + "completions/mean_terminated_length": 778.1122436523438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6961802780885409, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1489322828575107, + "kl": 0.028350830078125, + "learning_rate": 3.306281172926959e-07, + "loss": 0.0931, + "num_tokens": 1803735795.0, + "reward": 2.509486675262451, + "reward_std": 0.4343414306640625, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10681798309087753, + "step": 3267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 929.4219360351562, + "completions/mean_terminated_length": 753.1085205078125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.6963933727558468, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14209620506468532, + "kl": 0.031402587890625, + "learning_rate": 3.303358848634621e-07, + "loss": 0.1164, + "num_tokens": 1804218496.0, + "reward": 2.4620537757873535, + "reward_std": 0.4689207375049591, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13583585619926453, + "step": 3268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 877.419677734375, + "completions/mean_terminated_length": 706.7723999023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6966064674231527, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13174252037452464, + "kl": 0.02960205078125, + "learning_rate": 3.300437740015022e-07, + "loss": 0.1134, + "num_tokens": 1804688028.0, + "reward": 2.62890625, + "reward_std": 0.40082108974456787, + "rewards/accuracy_reward/mean": 0.7053571343421936, + "rewards/accuracy_reward/std": 0.45639166235923767, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10744724422693253, + "step": 3269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 876.2656860351562, + "completions/mean_terminated_length": 702.0076904296875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6968195620904587, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13423189162957397, + "kl": 0.030487060546875, + "learning_rate": 3.2975178486847724e-07, + "loss": 0.0876, + "num_tokens": 1805160755.0, + "reward": 2.4693081378936768, + "reward_std": 0.4009111225605011, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13124457001686096, + "step": 3270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1074.04248046875, + "completions/mean_terminated_length": 804.8860473632812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6970326567577646, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14131694558300043, + "kl": 0.026397705078125, + "learning_rate": 3.2945991762598054e-07, + "loss": 0.0731, + "num_tokens": 1805707270.0, + "reward": 2.23828125, + "reward_std": 0.4340845048427582, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.1539783775806427, + "step": 3271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 944.7410888671875, + "completions/mean_terminated_length": 764.207763671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6972457514250706, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.22698029651585092, + "kl": 0.031524658203125, + "learning_rate": 3.2916817243553885e-07, + "loss": 0.084, + "num_tokens": 1806208434.0, + "reward": 2.4464287757873535, + "reward_std": 0.5017050504684448, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1353386640548706, + "step": 3272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 912.4219360351562, + "completions/mean_terminated_length": 719.6997680664062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6974588460923765, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13584003178857615, + "kl": 0.030487060546875, + "learning_rate": 3.288765494586104e-07, + "loss": 0.0535, + "num_tokens": 1806679279.0, + "reward": 2.579799175262451, + "reward_std": 0.3927724063396454, + "rewards/accuracy_reward/mean": 0.6741071343421936, + "rewards/accuracy_reward/std": 0.4692314565181732, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13826683163642883, + "step": 3273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1034.571533203125, + "completions/mean_terminated_length": 783.3314819335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6976719407596825, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11238609365077733, + "kl": 0.02520751953125, + "learning_rate": 3.285850488565861e-07, + "loss": 0.0366, + "num_tokens": 1807212863.0, + "reward": 2.3660714626312256, + "reward_std": 0.3291527032852173, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.11074430495500565, + "step": 3274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 989.779052734375, + "completions/mean_terminated_length": 773.5833129882812, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.6978850354269884, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13342683138492548, + "kl": 0.026092529296875, + "learning_rate": 3.282936707907895e-07, + "loss": 0.0752, + "num_tokens": 1807724876.0, + "reward": 2.421875, + "reward_std": 0.39623740315437317, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12951265275478363, + "step": 3275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1007.904052734375, + "completions/mean_terminated_length": 785.2276611328125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6980981300942944, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12141763965678348, + "kl": 0.02813720703125, + "learning_rate": 3.28002415422476e-07, + "loss": 0.0985, + "num_tokens": 1808245169.0, + "reward": 2.3526787757873535, + "reward_std": 0.436323344707489, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13106490671634674, + "step": 3276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1065.654052734375, + "completions/mean_terminated_length": 811.789306640625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.6983112247616003, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.5026496786030623, + "kl": 0.026611328125, + "learning_rate": 3.27711282912833e-07, + "loss": 0.0591, + "num_tokens": 1808797814.0, + "reward": 2.2901787757873535, + "reward_std": 0.4232753813266754, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.16370916366577148, + "step": 3277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 915.794677734375, + "completions/mean_terminated_length": 727.09375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6985243194289062, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12542396628115388, + "kl": 0.026763916015625, + "learning_rate": 3.2742027342298013e-07, + "loss": 0.0094, + "num_tokens": 1809282330.0, + "reward": 2.4129464626312256, + "reward_std": 0.3796856999397278, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9910714030265808, + "rewards/tag_count_reward/std": 0.06192811205983162, + "step": 3278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 893.7656860351562, + "completions/mean_terminated_length": 795.9491577148438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.6987374140962123, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1308948560282255, + "kl": 0.030181884765625, + "learning_rate": 3.271293871139689e-07, + "loss": 0.0513, + "num_tokens": 1809749585.0, + "reward": 2.541294813156128, + "reward_std": 0.4330494701862335, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13709372282028198, + "step": 3279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 864.2567138671875, + "completions/mean_terminated_length": 751.3814697265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6989505087635182, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13624695986172078, + "kl": 0.030609130859375, + "learning_rate": 3.26838624146783e-07, + "loss": 0.0662, + "num_tokens": 1810206516.0, + "reward": 2.428013563156128, + "reward_std": 0.33159682154655457, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10481233894824982, + "step": 3280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 968.5558471679688, + "completions/mean_terminated_length": 804.83544921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6991636034308242, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1228726696466669, + "kl": 0.029266357421875, + "learning_rate": 3.2654798468233656e-07, + "loss": 0.0358, + "num_tokens": 1810710893.0, + "reward": 2.3856027126312256, + "reward_std": 0.42100322246551514, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.0940391793847084, + "step": 3281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 979.19873046875, + "completions/mean_terminated_length": 810.7312622070312, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.6993766980981301, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12560129284198732, + "kl": 0.02667236328125, + "learning_rate": 3.2625746888147705e-07, + "loss": 0.0942, + "num_tokens": 1811218230.0, + "reward": 2.5518975257873535, + "reward_std": 0.37651193141937256, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13276717066764832, + "step": 3282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 930.0335083007812, + "completions/mean_terminated_length": 736.876953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6995897927654361, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1511039599780612, + "kl": 0.027862548828125, + "learning_rate": 3.259670769049824e-07, + "loss": 0.1434, + "num_tokens": 1811704821.0, + "reward": 2.4425225257873535, + "reward_std": 0.35751911997795105, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396106719971, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10230488330125809, + "step": 3283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1108.274658203125, + "completions/mean_terminated_length": 858.742919921875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.699802887432742, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11136553704779555, + "kl": 0.026123046875, + "learning_rate": 3.256768089135625e-07, + "loss": 0.0336, + "num_tokens": 1812272016.0, + "reward": 2.3309152126312256, + "reward_std": 0.4128180146217346, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14590215682983398, + "step": 3284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 997.58935546875, + "completions/mean_terminated_length": 786.3807373046875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7000159821000479, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.125393409447218, + "kl": 0.027252197265625, + "learning_rate": 3.253866650678584e-07, + "loss": 0.085, + "num_tokens": 1812796712.0, + "reward": 2.4168527126312256, + "reward_std": 0.40719953179359436, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1276179552078247, + "step": 3285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 988.4308471679688, + "completions/mean_terminated_length": 782.16796875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7002290767673539, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13794475752739013, + "kl": 0.02886962890625, + "learning_rate": 3.250966455284423e-07, + "loss": 0.0681, + "num_tokens": 1813309481.0, + "reward": 2.4966518878936768, + "reward_std": 0.4522949755191803, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13709372282028198, + "step": 3286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 859.6719360351562, + "completions/mean_terminated_length": 665.2181396484375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.7004421714346598, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16778808787969596, + "kl": 0.031646728515625, + "learning_rate": 3.248067504558182e-07, + "loss": 0.1045, + "num_tokens": 1813765862.0, + "reward": 2.5078125, + "reward_std": 0.445243239402771, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.146973118185997, + "step": 3287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 996.716552734375, + "completions/mean_terminated_length": 795.4068603515625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7006552661019658, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12289475780202898, + "kl": 0.026153564453125, + "learning_rate": 3.2451698001042073e-07, + "loss": 0.0582, + "num_tokens": 1814285383.0, + "reward": 2.3878350257873535, + "reward_std": 0.40453317761421204, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005797147750854, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13566918671131134, + "step": 3288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 994.5357666015625, + "completions/mean_terminated_length": 837.86669921875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.7008683607692717, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1354577387269475, + "kl": 0.027740478515625, + "learning_rate": 3.242273343526154e-07, + "loss": 0.0689, + "num_tokens": 1814798615.0, + "reward": 2.5262277126312256, + "reward_std": 0.4395878314971924, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11780036240816116, + "step": 3289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1063.0335693359375, + "completions/mean_terminated_length": 776.3428955078125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7010814554365777, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1240309569726836, + "kl": 0.0245361328125, + "learning_rate": 3.23937813642699e-07, + "loss": 0.0648, + "num_tokens": 1815344070.0, + "reward": 2.4620537757873535, + "reward_std": 0.47612667083740234, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.16768723726272583, + "step": 3290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 923.63623046875, + "completions/mean_terminated_length": 769.5355224609375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7012945501038836, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13770741847895607, + "kl": 0.028961181640625, + "learning_rate": 3.2364841804089956e-07, + "loss": 0.0717, + "num_tokens": 1815825459.0, + "reward": 2.5396206378936768, + "reward_std": 0.4038306176662445, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835725784302, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11170817166566849, + "step": 3291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 996.51123046875, + "completions/mean_terminated_length": 855.42529296875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7015076447711897, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13289246472798402, + "kl": 0.02838134765625, + "learning_rate": 3.233591477073747e-07, + "loss": 0.1051, + "num_tokens": 1816340360.0, + "reward": 2.439732313156128, + "reward_std": 0.4351404905319214, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13271193206310272, + "step": 3292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 898.919677734375, + "completions/mean_terminated_length": 738.1068725585938, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.7017207394384956, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14747922268824798, + "kl": 0.032470703125, + "learning_rate": 3.2307000280221363e-07, + "loss": 0.0857, + "num_tokens": 1816819812.0, + "reward": 2.5496652126312256, + "reward_std": 0.4561227262020111, + "rewards/accuracy_reward/mean": 0.6785714030265808, + "rewards/accuracy_reward/std": 0.4675469994544983, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14151519536972046, + "step": 3293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 991.3058471679688, + "completions/mean_terminated_length": 775.4220581054688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7019338341058015, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12777596855568987, + "kl": 0.02947998046875, + "learning_rate": 3.227809834854361e-07, + "loss": 0.0807, + "num_tokens": 1817328749.0, + "reward": 2.4453125, + "reward_std": 0.4548998475074768, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.1865052878856659, + "step": 3294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 962.5245971679688, + "completions/mean_terminated_length": 726.5516357421875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7021469287731075, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1384345481062676, + "kl": 0.0284423828125, + "learning_rate": 3.224920899169922e-07, + "loss": 0.0823, + "num_tokens": 1817822792.0, + "reward": 2.4838171005249023, + "reward_std": 0.46545371413230896, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.15912973880767822, + "step": 3295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1045.509033203125, + "completions/mean_terminated_length": 789.9719848632812, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7023600234404134, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 1.0905837870370134, + "kl": 0.075958251953125, + "learning_rate": 3.2220332225676215e-07, + "loss": 0.1118, + "num_tokens": 1818366172.0, + "reward": 2.4291296005249023, + "reward_std": 0.4681529402732849, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16403579711914062, + "step": 3296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 976.0670166015625, + "completions/mean_terminated_length": 787.5643310546875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.7025731181077194, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12743050237934564, + "kl": 0.027984619140625, + "learning_rate": 3.2191468066455694e-07, + "loss": 0.075, + "num_tokens": 1818866426.0, + "reward": 2.6729912757873535, + "reward_std": 0.43493738770484924, + "rewards/accuracy_reward/mean": 0.765625, + "rewards/accuracy_reward/std": 0.42408111691474915, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14409084618091583, + "step": 3297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1101.30810546875, + "completions/mean_terminated_length": 853.3013916015625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7027862127750253, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14079531550953045, + "kl": 0.027069091796875, + "learning_rate": 3.216261653001174e-07, + "loss": 0.0873, + "num_tokens": 1819431508.0, + "reward": 2.3900671005249023, + "reward_std": 0.4381040036678314, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.1645980179309845, + "step": 3298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1002.4285888671875, + "completions/mean_terminated_length": 808.80419921875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7029993074423313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13705097501633545, + "kl": 0.029296875, + "learning_rate": 3.213377763231151e-07, + "loss": 0.0711, + "num_tokens": 1819946788.0, + "reward": 2.318638563156128, + "reward_std": 0.5260837078094482, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15933358669281006, + "step": 3299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1003.5357666015625, + "completions/mean_terminated_length": 779.9241333007812, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7032124021096372, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1254106013563999, + "kl": 0.02789306640625, + "learning_rate": 3.2104951389315073e-07, + "loss": 0.0933, + "num_tokens": 1820466884.0, + "reward": 2.5072546005249023, + "reward_std": 0.437931627035141, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12450841069221497, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 990.77685546875, + "completions/mean_terminated_length": 774.7849731445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7034254967769431, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12925601519596436, + "kl": 0.02734375, + "learning_rate": 3.2076137816975593e-07, + "loss": 0.0673, + "num_tokens": 1820978832.0, + "reward": 2.3900671005249023, + "reward_std": 0.3944319486618042, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.19573107361793518, + "step": 3301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1114.0513916015625, + "completions/mean_terminated_length": 882.5153198242188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.7036385914442491, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.16011009139431864, + "kl": 0.025848388671875, + "learning_rate": 3.204733693123916e-07, + "loss": 0.0795, + "num_tokens": 1821545799.0, + "reward": 2.3348214626312256, + "reward_std": 0.552046000957489, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.18524597585201263, + "step": 3302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1186.859375, + "completions/mean_terminated_length": 846.1588745117188, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.703851686111555, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12750023586832318, + "kl": 0.022796630859375, + "learning_rate": 3.201854874804485e-07, + "loss": 0.083, + "num_tokens": 1822148200.0, + "reward": 2.303013563156128, + "reward_std": 0.46007663011550903, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.8794642686843872, + "rewards/format_reward/std": 0.3259509205818176, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.1571800261735916, + "step": 3303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 1055.1004638671875, + "completions/mean_terminated_length": 794.9887084960938, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.704064780778861, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14221797129167296, + "kl": 0.026275634765625, + "learning_rate": 3.1989773283324736e-07, + "loss": 0.0912, + "num_tokens": 1822687653.0, + "reward": 2.3041296005249023, + "reward_std": 0.47645753622055054, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.8839285969734192, + "rewards/format_reward/std": 0.32066863775253296, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.15283909440040588, + "step": 3304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 875.9330444335938, + "completions/mean_terminated_length": 751.4913940429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7042778754461669, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13598281697186068, + "kl": 0.030303955078125, + "learning_rate": 3.1961010553003806e-07, + "loss": 0.0685, + "num_tokens": 1823148423.0, + "reward": 2.490513563156128, + "reward_std": 0.3628806173801422, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690935015678406, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.1057254895567894, + "step": 3305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 955.8438110351562, + "completions/mean_terminated_length": 777.1272583007812, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.704490970113473, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.118936527033922, + "kl": 0.028839111328125, + "learning_rate": 3.193226057300007e-07, + "loss": 0.056, + "num_tokens": 1823647841.0, + "reward": 2.533482313156128, + "reward_std": 0.36338889598846436, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09024729579687119, + "step": 3306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1107.0223388671875, + "completions/mean_terminated_length": 826.0927734375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7047040647807788, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12489503230430565, + "kl": 0.02337646484375, + "learning_rate": 3.1903523359224416e-07, + "loss": 0.1135, + "num_tokens": 1824217003.0, + "reward": 2.2840402126312256, + "reward_std": 0.49713265895843506, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.16202186048030853, + "step": 3307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1944.0, + "completions/mean_length": 959.7120971679688, + "completions/mean_terminated_length": 744.3823852539062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7049171594480849, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13976667539604745, + "kl": 0.0284423828125, + "learning_rate": 3.1874798927580703e-07, + "loss": 0.0885, + "num_tokens": 1824716506.0, + "reward": 2.43359375, + "reward_std": 0.44176995754241943, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13700605928897858, + "step": 3308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1196.7523193359375, + "completions/mean_terminated_length": 892.3666381835938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7051302541153908, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.24957462123262, + "kl": 0.065155029296875, + "learning_rate": 3.1846087293965705e-07, + "loss": 0.068, + "num_tokens": 1825324011.0, + "reward": 2.3130581378936768, + "reward_std": 0.44048577547073364, + "rewards/accuracy_reward/mean": 0.43287035822868347, + "rewards/accuracy_reward/std": 0.4960475564002991, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.12858274579048157, + "step": 3309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1117.540283203125, + "completions/mean_terminated_length": 863.7784423828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7053433487826967, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12080196882019427, + "kl": 0.023529052734375, + "learning_rate": 3.1817388474269104e-07, + "loss": 0.0969, + "num_tokens": 1825899037.0, + "reward": 2.4051339626312256, + "reward_std": 0.4640944302082062, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13503853976726532, + "step": 3310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1146.555908203125, + "completions/mean_terminated_length": 880.8121337890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7055564434500027, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12318427952721635, + "kl": 0.023834228515625, + "learning_rate": 3.17887024843735e-07, + "loss": 0.0895, + "num_tokens": 1826481734.0, + "reward": 2.3158483505249023, + "reward_std": 0.40195152163505554, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.13814601302146912, + "step": 3311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1032.0692138671875, + "completions/mean_terminated_length": 758.6600341796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7057695381173086, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13737603757207636, + "kl": 0.028106689453125, + "learning_rate": 3.1760029340154395e-07, + "loss": 0.0925, + "num_tokens": 1827014261.0, + "reward": 2.3515625, + "reward_std": 0.40651756525039673, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.17337898910045624, + "step": 3312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1134.9398193359375, + "completions/mean_terminated_length": 851.9444580078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7059826327846146, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13012824840905396, + "kl": 0.02520751953125, + "learning_rate": 3.173136905748018e-07, + "loss": 0.0709, + "num_tokens": 1827596650.0, + "reward": 2.3392858505249023, + "reward_std": 0.43294811248779297, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14480386674404144, + "step": 3313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 961.7567138671875, + "completions/mean_terminated_length": 753.7526245117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7061957274519205, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14694851496720768, + "kl": 0.028076171875, + "learning_rate": 3.1702721652212163e-07, + "loss": 0.1154, + "num_tokens": 1828098445.0, + "reward": 2.3370537757873535, + "reward_std": 0.4137912094593048, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.1554640233516693, + "step": 3314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1006.388427734375, + "completions/mean_terminated_length": 810.2227783203125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.7064088221192265, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13507016009995815, + "kl": 0.027099609375, + "learning_rate": 3.167408714020442e-07, + "loss": 0.1006, + "num_tokens": 1828614683.0, + "reward": 2.390625, + "reward_std": 0.440521240234375, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.13888955116271973, + "step": 3315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1129.3795166015625, + "completions/mean_terminated_length": 858.572265625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.7066219167865324, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13477461935518678, + "kl": 0.023773193359375, + "learning_rate": 3.164546553730401e-07, + "loss": 0.0955, + "num_tokens": 1829185861.0, + "reward": 2.4051339626312256, + "reward_std": 0.4446280598640442, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1566121131181717, + "step": 3316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 950.9129638671875, + "completions/mean_terminated_length": 777.9871215820312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7068350114538383, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1375163035637281, + "kl": 0.031097412109375, + "learning_rate": 3.161685685935077e-07, + "loss": 0.0591, + "num_tokens": 1829679198.0, + "reward": 2.5011162757873535, + "reward_std": 0.3659578561782837, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.1194591298699379, + "step": 3317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1156.57373046875, + "completions/mean_terminated_length": 866.4645385742188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.7070481061211443, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12488575807549228, + "kl": 0.024200439453125, + "learning_rate": 3.158826112217747e-07, + "loss": 0.0518, + "num_tokens": 1830272735.0, + "reward": 2.3621652126312256, + "reward_std": 0.4514215588569641, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.12074156850576401, + "step": 3318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1069.384033203125, + "completions/mean_terminated_length": 826.7743530273438, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.7072612007884502, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11253983843588625, + "kl": 0.025604248046875, + "learning_rate": 3.1559678341609585e-07, + "loss": 0.0824, + "num_tokens": 1830816971.0, + "reward": 2.4458706378936768, + "reward_std": 0.4225183427333832, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1247187927365303, + "step": 3319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 984.4732666015625, + "completions/mean_terminated_length": 797.4487915039062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7074742954557562, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12582896554687586, + "kl": 0.025787353515625, + "learning_rate": 3.1531108533465557e-07, + "loss": 0.0502, + "num_tokens": 1831336399.0, + "reward": 2.3214287757873535, + "reward_std": 0.495141863822937, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1567276567220688, + "step": 3320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1029.321533203125, + "completions/mean_terminated_length": 801.0928955078125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7076873901230621, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12302125028672911, + "kl": 0.026580810546875, + "learning_rate": 3.150255171355656e-07, + "loss": 0.0587, + "num_tokens": 1831873087.0, + "reward": 2.4034600257873535, + "reward_std": 0.47603335976600647, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12381463497877121, + "step": 3321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1064.8348388671875, + "completions/mean_terminated_length": 789.548583984375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7079004847903682, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12348766134258168, + "kl": 0.023284912109375, + "learning_rate": 3.1474007897686615e-07, + "loss": 0.0515, + "num_tokens": 1832427493.0, + "reward": 2.4268975257873535, + "reward_std": 0.40408816933631897, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10681799054145813, + "step": 3322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 888.7254638671875, + "completions/mean_terminated_length": 709.4561767578125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.7081135794576741, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15826759025383902, + "kl": 0.032196044921875, + "learning_rate": 3.144547710165254e-07, + "loss": 0.1183, + "num_tokens": 1832894810.0, + "reward": 2.509486675262451, + "reward_std": 0.40651994943618774, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11395422369241714, + "step": 3323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1017.6563110351562, + "completions/mean_terminated_length": 776.3911743164062, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.7083266741249801, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10936487406565053, + "kl": 0.02777099609375, + "learning_rate": 3.141695934124392e-07, + "loss": 0.0915, + "num_tokens": 1833419456.0, + "reward": 2.54296875, + "reward_std": 0.4157126545906067, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1328611671924591, + "step": 3324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 972.57373046875, + "completions/mean_terminated_length": 773.4205932617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.708539768792286, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13087561067421552, + "kl": 0.0283203125, + "learning_rate": 3.1388454632243217e-07, + "loss": 0.0559, + "num_tokens": 1833923233.0, + "reward": 2.3934152126312256, + "reward_std": 0.4466537833213806, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1280086487531662, + "step": 3325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 967.43310546875, + "completions/mean_terminated_length": 767.3280029296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7087528634595919, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.21556822016114494, + "kl": 0.028533935546875, + "learning_rate": 3.1359962990425546e-07, + "loss": 0.0913, + "num_tokens": 1834433155.0, + "reward": 2.4536831378936768, + "reward_std": 0.4137965440750122, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.146371990442276, + "step": 3326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1018.3951416015625, + "completions/mean_terminated_length": 811.3699951171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7089659581268979, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12600631662270473, + "kl": 0.027984619140625, + "learning_rate": 3.1331484431558853e-07, + "loss": 0.0762, + "num_tokens": 1834964116.0, + "reward": 2.4481027126312256, + "reward_std": 0.46774420142173767, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13826683163642883, + "step": 3327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1084.8638916015625, + "completions/mean_terminated_length": 829.1158447265625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7091790527942038, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13520939512821195, + "kl": 0.02410888671875, + "learning_rate": 3.130301897140387e-07, + "loss": 0.0938, + "num_tokens": 1835523687.0, + "reward": 2.3861608505249023, + "reward_std": 0.43371132016181946, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12935835123062134, + "step": 3328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1055.9085693359375, + "completions/mean_terminated_length": 803.0223999023438, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7093921474615098, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10952494960144282, + "kl": 0.023529052734375, + "learning_rate": 3.127456662571405e-07, + "loss": 0.0324, + "num_tokens": 1836064350.0, + "reward": 2.385044813156128, + "reward_std": 0.34014779329299927, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.10442600399255753, + "step": 3329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1059.747802734375, + "completions/mean_terminated_length": 838.3360595703125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7096052421288157, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12125403815746828, + "kl": 0.026153564453125, + "learning_rate": 3.1246127410235556e-07, + "loss": 0.058, + "num_tokens": 1836606973.0, + "reward": 2.4012277126312256, + "reward_std": 0.4782784581184387, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.150824636220932, + "step": 3330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1013.5469360351562, + "completions/mean_terminated_length": 792.07861328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7098183367961217, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1420265675907378, + "kl": 0.0286865234375, + "learning_rate": 3.1217701340707334e-07, + "loss": 0.0812, + "num_tokens": 1837125378.0, + "reward": 2.454799175262451, + "reward_std": 0.3740387260913849, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.14072787761688232, + "step": 3331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 967.1495971679688, + "completions/mean_terminated_length": 717.7225341796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7100314314634276, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12889704686773856, + "kl": 0.029541015625, + "learning_rate": 3.1189288432861056e-07, + "loss": 0.1189, + "num_tokens": 1837623157.0, + "reward": 2.4793527126312256, + "reward_std": 0.49332547187805176, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12381463497877121, + "step": 3332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1026.1763916015625, + "completions/mean_terminated_length": 797.2431640625, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7102445261307336, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13855725035464447, + "kl": 0.02685546875, + "learning_rate": 3.1160888702421086e-07, + "loss": 0.0884, + "num_tokens": 1838151540.0, + "reward": 2.4654018878936768, + "reward_std": 0.39011961221694946, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12975822389125824, + "step": 3333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1190.8326416015625, + "completions/mean_terminated_length": 918.555908203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7104576207980395, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10397632559255922, + "kl": 0.0224609375, + "learning_rate": 3.1132502165104457e-07, + "loss": 0.0566, + "num_tokens": 1838754009.0, + "reward": 2.357701063156128, + "reward_std": 0.4094166159629822, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12151455134153366, + "step": 3334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 931.6964721679688, + "completions/mean_terminated_length": 735.3910522460938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7106707154653454, + "frac_reward_zero_std": 0.3214285969734192, + "grad_norm": 0.12226213723928774, + "kl": 0.02728271484375, + "learning_rate": 3.1104128836621e-07, + "loss": 0.0777, + "num_tokens": 1839246289.0, + "reward": 2.33984375, + "reward_std": 0.3604331314563751, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1354389488697052, + "step": 3335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 959.7388916015625, + "completions/mean_terminated_length": 816.8358764648438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7108838101326515, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1390103376987811, + "kl": 0.029144287109375, + "learning_rate": 3.1075768732673156e-07, + "loss": 0.1006, + "num_tokens": 1839742092.0, + "reward": 2.587611675262451, + "reward_std": 0.43270811438560486, + "rewards/accuracy_reward/mean": 0.6741071343421936, + "rewards/accuracy_reward/std": 0.46923142671585083, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12151455134153366, + "step": 3336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1049.9754638671875, + "completions/mean_terminated_length": 799.0753784179688, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7110969047999574, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12199422006775361, + "kl": 0.024749755859375, + "learning_rate": 3.104742186895608e-07, + "loss": 0.0704, + "num_tokens": 1840279649.0, + "reward": 2.431919813156128, + "reward_std": 0.361887663602829, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493200302124, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.11055814474821091, + "step": 3337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1100.21875, + "completions/mean_terminated_length": 795.4749145507812, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.7113099994672634, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11430266827547936, + "kl": 0.02532958984375, + "learning_rate": 3.101908826115758e-07, + "loss": 0.029, + "num_tokens": 1840840419.0, + "reward": 2.3828125, + "reward_std": 0.3795602023601532, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14601868391036987, + "step": 3338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 916.65185546875, + "completions/mean_terminated_length": 741.7009887695312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7115230941345693, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14603279244430828, + "kl": 0.03070068359375, + "learning_rate": 3.0990767924958133e-07, + "loss": 0.0949, + "num_tokens": 1841321335.0, + "reward": 2.5345983505249023, + "reward_std": 0.4367714822292328, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12177752703428268, + "step": 3339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 984.154052734375, + "completions/mean_terminated_length": 752.8831787109375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7117361888018753, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.142662817238777, + "kl": 0.028106689453125, + "learning_rate": 3.0962460876030903e-07, + "loss": 0.0845, + "num_tokens": 1841834252.0, + "reward": 2.5340402126312256, + "reward_std": 0.44301551580429077, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12062777578830719, + "step": 3340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 975.6272583007812, + "completions/mean_terminated_length": 790.3482055664062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7119492834691812, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13259397625033023, + "kl": 0.02679443359375, + "learning_rate": 3.0934167130041666e-07, + "loss": 0.1063, + "num_tokens": 1842345829.0, + "reward": 2.4927456378936768, + "reward_std": 0.47583070397377014, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1377149522304535, + "step": 3341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1031.607177734375, + "completions/mean_terminated_length": 779.63232421875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.7121623781364871, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11652767259003653, + "kl": 0.025238037109375, + "learning_rate": 3.090588670264883e-07, + "loss": 0.0551, + "num_tokens": 1842872933.0, + "reward": 2.4575893878936768, + "reward_std": 0.45661118626594543, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12672585248947144, + "step": 3342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 895.1428833007812, + "completions/mean_terminated_length": 720.2879028320312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7123754728037931, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13516968570754018, + "kl": 0.030181884765625, + "learning_rate": 3.087761960950345e-07, + "loss": 0.0084, + "num_tokens": 1843335365.0, + "reward": 2.4425225257873535, + "reward_std": 0.34387677907943726, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11395422369241714, + "step": 3343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1017.4219360351562, + "completions/mean_terminated_length": 826.5740356445312, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.712588567471099, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11950758731982991, + "kl": 0.0264892578125, + "learning_rate": 3.0849365866249233e-07, + "loss": 0.0719, + "num_tokens": 1843861746.0, + "reward": 2.5284600257873535, + "reward_std": 0.3759201169013977, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.09830980002880096, + "step": 3344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 959.2210083007812, + "completions/mean_terminated_length": 800.4987182617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.712801662138405, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1235019831248256, + "kl": 0.030181884765625, + "learning_rate": 3.0821125488522426e-07, + "loss": 0.0744, + "num_tokens": 1844355301.0, + "reward": 2.345982313156128, + "reward_std": 0.3601638376712799, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316954612732, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12951265275478363, + "step": 3345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 999.8638916015625, + "completions/mean_terminated_length": 754.4324951171875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.7130147568057109, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14013754790379926, + "kl": 0.0279541015625, + "learning_rate": 3.07928984919519e-07, + "loss": 0.0855, + "num_tokens": 1844863704.0, + "reward": 2.4799108505249023, + "reward_std": 0.36846476793289185, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14040927588939667, + "step": 3346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1002.450927734375, + "completions/mean_terminated_length": 754.060791015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7132278514730169, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.16233568333649878, + "kl": 0.027191162109375, + "learning_rate": 3.076468489215919e-07, + "loss": 0.0865, + "num_tokens": 1845382674.0, + "reward": 2.3861608505249023, + "reward_std": 0.40563100576400757, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13318143784999847, + "step": 3347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 973.5313110351562, + "completions/mean_terminated_length": 804.1705932617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7134409461403228, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13222379850792534, + "kl": 0.026824951171875, + "learning_rate": 3.0736484704758327e-07, + "loss": 0.0479, + "num_tokens": 1845883760.0, + "reward": 2.4676339626312256, + "reward_std": 0.4564989507198334, + "rewards/accuracy_reward/mean": 0.5925925970077515, + "rewards/accuracy_reward/std": 0.49192148447036743, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1396559476852417, + "step": 3348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1020.9308471679688, + "completions/mean_terminated_length": 849.7526245117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7136540408076288, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13173485116039826, + "kl": 0.0260009765625, + "learning_rate": 3.0708297945355975e-07, + "loss": 0.0802, + "num_tokens": 1846419873.0, + "reward": 2.4402902126312256, + "reward_std": 0.4642750322818756, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12848563492298126, + "step": 3349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1059.185302734375, + "completions/mean_terminated_length": 767.6849365234375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7138671354749347, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1325594760609868, + "kl": 0.0272216796875, + "learning_rate": 3.068012462955133e-07, + "loss": 0.0741, + "num_tokens": 1846964692.0, + "reward": 2.412388563156128, + "reward_std": 0.48228099942207336, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15266746282577515, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1051.884033203125, + "completions/mean_terminated_length": 811.8226928710938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.7140802301422406, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1326909796334596, + "kl": 0.026336669921875, + "learning_rate": 3.065196477293616e-07, + "loss": 0.0685, + "num_tokens": 1847508160.0, + "reward": 2.3671875, + "reward_std": 0.5420317053794861, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3310886323451996, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.16511768102645874, + "step": 3351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 977.0089721679688, + "completions/mean_terminated_length": 785.35791015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7142933248095467, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1250874401117488, + "kl": 0.029693603515625, + "learning_rate": 3.0623818391094846e-07, + "loss": 0.0604, + "num_tokens": 1848013604.0, + "reward": 2.427455425262451, + "reward_std": 0.4641398787498474, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16796617209911346, + "step": 3352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1017.8527221679688, + "completions/mean_terminated_length": 769.5900268554688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7145064194768526, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13307198861320274, + "kl": 0.028167724609375, + "learning_rate": 3.0595685499604176e-07, + "loss": 0.0541, + "num_tokens": 1848530898.0, + "reward": 2.4095983505249023, + "reward_std": 0.3591146767139435, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11709482222795486, + "step": 3353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 986.9397583007812, + "completions/mean_terminated_length": 780.3866577148438, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.7147195141441586, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1343722109909295, + "kl": 0.027099609375, + "learning_rate": 3.0567566114033605e-07, + "loss": 0.0875, + "num_tokens": 1849042599.0, + "reward": 2.4363839626312256, + "reward_std": 0.4808584153652191, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16906259953975677, + "step": 3354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 990.3817138671875, + "completions/mean_terminated_length": 770.8759765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7149326088114645, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11637771505213464, + "kl": 0.0250244140625, + "learning_rate": 3.053946024994505e-07, + "loss": 0.0784, + "num_tokens": 1849551602.0, + "reward": 2.4720983505249023, + "reward_std": 0.4218716323375702, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13559210300445557, + "step": 3355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1070.21875, + "completions/mean_terminated_length": 800.0056762695312, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7151457034787705, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1181530438472655, + "kl": 0.0264892578125, + "learning_rate": 3.0511367922892957e-07, + "loss": 0.0712, + "num_tokens": 1850101636.0, + "reward": 2.3331475257873535, + "reward_std": 0.525721549987793, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16657331585884094, + "step": 3356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1026.08935546875, + "completions/mean_terminated_length": 776.2888793945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7153587981460764, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12344795420859954, + "kl": 0.025238037109375, + "learning_rate": 3.048328914842426e-07, + "loss": 0.0457, + "num_tokens": 1850641276.0, + "reward": 2.353794813156128, + "reward_std": 0.4471887946128845, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005797147750854, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1728886514902115, + "step": 3357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1081.3951416015625, + "completions/mean_terminated_length": 864.8333129882812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7155718928133823, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14333011356686176, + "kl": 0.027801513671875, + "learning_rate": 3.0455223942078424e-07, + "loss": 0.0532, + "num_tokens": 1851200621.0, + "reward": 2.4464287757873535, + "reward_std": 0.40338632464408875, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 3358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1041.513427734375, + "completions/mean_terminated_length": 822.7119750976562, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7157849874806883, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10878182717875556, + "kl": 0.024688720703125, + "learning_rate": 3.0427172319387405e-07, + "loss": 0.0347, + "num_tokens": 1851737107.0, + "reward": 2.4838171005249023, + "reward_std": 0.4349047541618347, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15174883604049683, + "step": 3359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1005.8125610351562, + "completions/mean_terminated_length": 782.6883544921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7159980821479942, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13538172255775882, + "kl": 0.028472900390625, + "learning_rate": 3.0399134295875607e-07, + "loss": 0.0596, + "num_tokens": 1852259695.0, + "reward": 2.3738839626312256, + "reward_std": 0.42971986532211304, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.131288543343544, + "step": 3360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 948.263427734375, + "completions/mean_terminated_length": 797.5380249023438, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7162111768153002, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12632024327275226, + "kl": 0.0279541015625, + "learning_rate": 3.0371109887059954e-07, + "loss": 0.0777, + "num_tokens": 1852752245.0, + "reward": 2.6065850257873535, + "reward_std": 0.4649077355861664, + "rewards/accuracy_reward/mean": 0.7232142686843872, + "rewards/accuracy_reward/std": 0.44790980219841003, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14557664096355438, + "step": 3361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1013.1183471679688, + "completions/mean_terminated_length": 801.6908569335938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7164242714826061, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1258924157932152, + "kl": 0.02508544921875, + "learning_rate": 3.034309910844979e-07, + "loss": 0.0759, + "num_tokens": 1853277898.0, + "reward": 2.3900671005249023, + "reward_std": 0.45095714926719666, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14936909079551697, + "step": 3362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1102.404052734375, + "completions/mean_terminated_length": 884.1895751953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7166373661499121, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13232439371994847, + "kl": 0.0255126953125, + "learning_rate": 3.0315101975546953e-07, + "loss": 0.0885, + "num_tokens": 1853850143.0, + "reward": 2.330357313156128, + "reward_std": 0.5085436701774597, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.1634649783372879, + "step": 3363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 956.6563110351562, + "completions/mean_terminated_length": 787.8917236328125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.716850460817218, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12088820622748457, + "kl": 0.02862548828125, + "learning_rate": 3.02871185038457e-07, + "loss": 0.0591, + "num_tokens": 1854348933.0, + "reward": 2.4408483505249023, + "reward_std": 0.4369667172431946, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13709372282028198, + "step": 3364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 957.4866333007812, + "completions/mean_terminated_length": 748.6648559570312, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.717063555484524, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1509645566878022, + "kl": 0.02838134765625, + "learning_rate": 3.0259148708832717e-07, + "loss": 0.1007, + "num_tokens": 1854843807.0, + "reward": 2.4871652126312256, + "reward_std": 0.4313191771507263, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4846842288970947, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.1654377579689026, + "step": 3365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 854.2879638671875, + "completions/mean_terminated_length": 669.6932983398438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.71727665015183, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13238180384958176, + "kl": 0.031219482421875, + "learning_rate": 3.023119260598721e-07, + "loss": 0.0717, + "num_tokens": 1855297440.0, + "reward": 2.5474331378936768, + "reward_std": 0.35852569341659546, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835427761078, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11682131141424179, + "step": 3366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 915.9285888671875, + "completions/mean_terminated_length": 706.2857055664062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7174897448191359, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1480645840062063, + "kl": 0.031982421875, + "learning_rate": 3.020325021078069e-07, + "loss": 0.12, + "num_tokens": 1855772320.0, + "reward": 2.5306921005249023, + "reward_std": 0.4434750974178314, + "rewards/accuracy_reward/mean": 0.6574074029922485, + "rewards/accuracy_reward/std": 0.4751267731189728, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15343420207500458, + "step": 3367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1013.6451416015625, + "completions/mean_terminated_length": 774.9478149414062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7177028394864419, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13590651085789632, + "kl": 0.027191162109375, + "learning_rate": 3.017532153867716e-07, + "loss": 0.1053, + "num_tokens": 1856299233.0, + "reward": 2.419642925262451, + "reward_std": 0.4332006573677063, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14714714884757996, + "step": 3368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 964.8527221679688, + "completions/mean_terminated_length": 767.6570434570312, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7179159341537478, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11508799739620248, + "kl": 0.02679443359375, + "learning_rate": 3.014740660513298e-07, + "loss": 0.0679, + "num_tokens": 1856791231.0, + "reward": 2.3582589626312256, + "reward_std": 0.3488439917564392, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15391045808792114, + "step": 3369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 952.0803833007812, + "completions/mean_terminated_length": 814.4019775390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7181290288210538, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13211528006286527, + "kl": 0.0284423828125, + "learning_rate": 3.0119505425596926e-07, + "loss": 0.04, + "num_tokens": 1857290067.0, + "reward": 2.5161831378936768, + "reward_std": 0.39218929409980774, + "rewards/accuracy_reward/mean": 0.6180555820465088, + "rewards/accuracy_reward/std": 0.48642635345458984, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9849330186843872, + "rewards/tag_count_reward/std": 0.09263478219509125, + "step": 3370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1046.046875, + "completions/mean_terminated_length": 790.6470947265625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7183421234883597, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13010871659163667, + "kl": 0.025665283203125, + "learning_rate": 3.009161801551022e-07, + "loss": 0.0769, + "num_tokens": 1857825448.0, + "reward": 2.3872768878936768, + "reward_std": 0.48215585947036743, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1464626044034958, + "step": 3371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 871.2924194335938, + "completions/mean_terminated_length": 710.0177612304688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7185552181556657, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.148438393209212, + "kl": 0.0308837890625, + "learning_rate": 3.006374439030633e-07, + "loss": 0.0979, + "num_tokens": 1858279643.0, + "reward": 2.533482313156128, + "reward_std": 0.4409452974796295, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.15048591792583466, + "step": 3372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 986.7813110351562, + "completions/mean_terminated_length": 793.577880859375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7187683128229716, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12587387667349265, + "kl": 0.027984619140625, + "learning_rate": 3.003588456541123e-07, + "loss": 0.0864, + "num_tokens": 1858787449.0, + "reward": 2.4598214626312256, + "reward_std": 0.41850295662879944, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13526484370231628, + "step": 3373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1104.102783203125, + "completions/mean_terminated_length": 750.864990234375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.7189814074902775, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12169853656544591, + "kl": 0.027557373046875, + "learning_rate": 3.000803855624318e-07, + "loss": 0.0349, + "num_tokens": 1859352151.0, + "reward": 2.2533483505249023, + "reward_std": 0.4910773038864136, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.18940123915672302, + "step": 3374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 794.0670166015625, + "completions/mean_terminated_length": 643.594970703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7191945021575835, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1574005011889106, + "kl": 0.03558349609375, + "learning_rate": 2.9980206378212824e-07, + "loss": 0.0756, + "num_tokens": 1859777893.0, + "reward": 2.6356027126312256, + "reward_std": 0.42525947093963623, + "rewards/accuracy_reward/mean": 0.7321428656578064, + "rewards/accuracy_reward/std": 0.4433377981185913, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.15223345160484314, + "step": 3375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 928.15185546875, + "completions/mean_terminated_length": 731.2230834960938, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7194075968248894, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1450262779853937, + "kl": 0.03265380859375, + "learning_rate": 2.9952388046723133e-07, + "loss": 0.1113, + "num_tokens": 1860260185.0, + "reward": 2.568080425262451, + "reward_std": 0.40151655673980713, + "rewards/accuracy_reward/mean": 0.6808035969734192, + "rewards/accuracy_reward/std": 0.4666863977909088, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14601868391036987, + "step": 3376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 999.7835083007812, + "completions/mean_terminated_length": 818.6780395507812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7196206914921954, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.128861760934775, + "kl": 0.0272216796875, + "learning_rate": 2.9924583577169404e-07, + "loss": 0.0517, + "num_tokens": 1860780040.0, + "reward": 2.4620537757873535, + "reward_std": 0.3908250629901886, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11967316269874573, + "step": 3377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 963.0067138671875, + "completions/mean_terminated_length": 772.2073364257812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7198337861595013, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14201917937765637, + "kl": 0.02947998046875, + "learning_rate": 2.9896792984939346e-07, + "loss": 0.1067, + "num_tokens": 1861276075.0, + "reward": 2.4642858505249023, + "reward_std": 0.4351259171962738, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16231536865234375, + "step": 3378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1013.5692138671875, + "completions/mean_terminated_length": 805.57373046875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.7200468808268073, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13663790131044107, + "kl": 0.028900146484375, + "learning_rate": 2.9869016285412853e-07, + "loss": 0.0597, + "num_tokens": 1861798042.0, + "reward": 2.4754464626312256, + "reward_std": 0.47384026646614075, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12040118128061295, + "step": 3379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1054.388427734375, + "completions/mean_terminated_length": 848.1671142578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7202599754941132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1348566846109887, + "kl": 0.0283203125, + "learning_rate": 2.9841253493962235e-07, + "loss": 0.0794, + "num_tokens": 1862341896.0, + "reward": 2.470424175262451, + "reward_std": 0.4618629813194275, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13416089117527008, + "step": 3380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1022.7522583007812, + "completions/mean_terminated_length": 772.1361083984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7204730701614193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3725021170002289, + "kl": 0.027496337890625, + "learning_rate": 2.981350462595209e-07, + "loss": 0.046, + "num_tokens": 1862873593.0, + "reward": 2.411830425262451, + "reward_std": 0.40283289551734924, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11141301691532135, + "step": 3381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 927.5938110351562, + "completions/mean_terminated_length": 750.9922485351562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7206861648287252, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1437815809805686, + "kl": 0.0301513671875, + "learning_rate": 2.978576969673926e-07, + "loss": 0.1365, + "num_tokens": 1863362147.0, + "reward": 2.431919813156128, + "reward_std": 0.4461188316345215, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9520089030265808, + "rewards/tag_count_reward/std": 0.17438411712646484, + "step": 3382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 993.4754638671875, + "completions/mean_terminated_length": 753.679443359375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7208992594960311, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13574330888021796, + "kl": 0.027069091796875, + "learning_rate": 2.9758048721672917e-07, + "loss": 0.068, + "num_tokens": 1863875160.0, + "reward": 2.392857313156128, + "reward_std": 0.3938256800174713, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1373893767595291, + "step": 3383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 929.0826416015625, + "completions/mean_terminated_length": 718.3580932617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7211123541633371, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14170788217378083, + "kl": 0.027801513671875, + "learning_rate": 2.973034171609449e-07, + "loss": 0.1309, + "num_tokens": 1864355453.0, + "reward": 2.5200893878936768, + "reward_std": 0.4428699016571045, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14140157401561737, + "step": 3384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1085.997802734375, + "completions/mean_terminated_length": 854.1578979492188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.721325448830643, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10269392190679648, + "kl": 0.025238037109375, + "learning_rate": 2.970264869533771e-07, + "loss": 0.0354, + "num_tokens": 1864915468.0, + "reward": 2.3364956378936768, + "reward_std": 0.3664836585521698, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9860491156578064, + "rewards/tag_count_reward/std": 0.09722442179918289, + "step": 3385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 895.560302734375, + "completions/mean_terminated_length": 713.9096069335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.721538543497949, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14456626840655323, + "kl": 0.03448486328125, + "learning_rate": 2.9674969674728546e-07, + "loss": 0.0742, + "num_tokens": 1865381751.0, + "reward": 2.53515625, + "reward_std": 0.42612650990486145, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13416090607643127, + "step": 3386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 926.4866333007812, + "completions/mean_terminated_length": 782.4130859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7217516381652549, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12529746810743825, + "kl": 0.029388427734375, + "learning_rate": 2.964730466958517e-07, + "loss": 0.0678, + "num_tokens": 1865870545.0, + "reward": 2.5027902126312256, + "reward_std": 0.37594395875930786, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13485699892044067, + "step": 3387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 928.9263916015625, + "completions/mean_terminated_length": 678.2048950195312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7219647328325609, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12656152216409805, + "kl": 0.0311279296875, + "learning_rate": 2.961965369521809e-07, + "loss": 0.0661, + "num_tokens": 1866350976.0, + "reward": 2.46875, + "reward_std": 0.3738847076892853, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14530304074287415, + "step": 3388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1039.493408203125, + "completions/mean_terminated_length": 764.446044921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7221778274998668, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11924420041602478, + "kl": 0.027679443359375, + "learning_rate": 2.9592016766929996e-07, + "loss": 0.0268, + "num_tokens": 1866883437.0, + "reward": 2.400111675262451, + "reward_std": 0.4034498929977417, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14921021461486816, + "step": 3389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 917.7388916015625, + "completions/mean_terminated_length": 739.583984375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.7223909221671727, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1374622423647146, + "kl": 0.030792236328125, + "learning_rate": 2.95643939000158e-07, + "loss": 0.078, + "num_tokens": 1867367832.0, + "reward": 2.4988839626312256, + "reward_std": 0.34756582975387573, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9854910969734192, + "rewards/tag_count_reward/std": 0.08887429535388947, + "step": 3390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 981.0647583007812, + "completions/mean_terminated_length": 727.5939331054688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7226040168344787, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1314783628689341, + "kl": 0.029571533203125, + "learning_rate": 2.953678510976265e-07, + "loss": 0.0069, + "num_tokens": 1867878853.0, + "reward": 2.4347100257873535, + "reward_std": 0.3679303824901581, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14323382079601288, + "step": 3391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1042.930908203125, + "completions/mean_terminated_length": 814.380859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7228171115017846, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.20629654114684143, + "kl": 0.0592041015625, + "learning_rate": 2.950919041144988e-07, + "loss": 0.069, + "num_tokens": 1868420742.0, + "reward": 2.443638563156128, + "reward_std": 0.3858453333377838, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407156348228455, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.09244592487812042, + "step": 3392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 870.9598388671875, + "completions/mean_terminated_length": 719.7531127929688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7230302061690906, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15618052955148484, + "kl": 0.031829833984375, + "learning_rate": 2.9481609820349085e-07, + "loss": 0.0788, + "num_tokens": 1868881796.0, + "reward": 2.4799108505249023, + "reward_std": 0.37334492802619934, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11531686037778854, + "step": 3393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1067.6317138671875, + "completions/mean_terminated_length": 796.7036743164062, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7232433008363965, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12719264167274025, + "kl": 0.028472900390625, + "learning_rate": 2.945404335172399e-07, + "loss": 0.0709, + "num_tokens": 1869428639.0, + "reward": 2.4051339626312256, + "reward_std": 0.4426869750022888, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14457522332668304, + "step": 3394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 973.0692138671875, + "completions/mean_terminated_length": 780.7131958007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7234563955037026, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13510886682063347, + "kl": 0.0284423828125, + "learning_rate": 2.942649102083051e-07, + "loss": 0.0985, + "num_tokens": 1869935102.0, + "reward": 2.5552456378936768, + "reward_std": 0.45971906185150146, + "rewards/accuracy_reward/mean": 0.6428571343421936, + "rewards/accuracy_reward/std": 0.47969305515289307, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.1475696861743927, + "step": 3395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 1137.5, + "completions/mean_terminated_length": 811.92724609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7236694901710085, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11903697103639525, + "kl": 0.02264404296875, + "learning_rate": 2.939895284291677e-07, + "loss": 0.0573, + "num_tokens": 1870516462.0, + "reward": 2.3861608505249023, + "reward_std": 0.3681625723838806, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09910815209150314, + "step": 3396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1004.62060546875, + "completions/mean_terminated_length": 781.2412109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7238825848383145, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1373640118790415, + "kl": 0.028656005859375, + "learning_rate": 2.9371428833223056e-07, + "loss": 0.0539, + "num_tokens": 1871031524.0, + "reward": 2.5200893878936768, + "reward_std": 0.43982771039009094, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14816173911094666, + "step": 3397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 981.5625610351562, + "completions/mean_terminated_length": 756.7459716796875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7240956795056204, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1359291534914912, + "kl": 0.02740478515625, + "learning_rate": 2.9343919006981763e-07, + "loss": 0.0797, + "num_tokens": 1871541424.0, + "reward": 2.4654018878936768, + "reward_std": 0.38752931356430054, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1401556432247162, + "step": 3398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 952.2678833007812, + "completions/mean_terminated_length": 776.2694091796875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7243087741729263, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14431950148324868, + "kl": 0.031097412109375, + "learning_rate": 2.931642337941749e-07, + "loss": 0.098, + "num_tokens": 1872036328.0, + "reward": 2.46875, + "reward_std": 0.42768269777297974, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15220165252685547, + "step": 3399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1067.9598388671875, + "completions/mean_terminated_length": 814.6910400390625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.7245218688402323, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13226005065580296, + "kl": 0.0242919921875, + "learning_rate": 2.928894196574697e-07, + "loss": 0.121, + "num_tokens": 1872583670.0, + "reward": 2.4838171005249023, + "reward_std": 0.4587395191192627, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14661914110183716, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 835.1763916015625, + "completions/mean_terminated_length": 682.8115234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7247349635075382, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12861472321939502, + "kl": 0.029632568359375, + "learning_rate": 2.9261474781179075e-07, + "loss": 0.0461, + "num_tokens": 1873029493.0, + "reward": 2.493861675262451, + "reward_std": 0.38119107484817505, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1354389488697052, + "step": 3401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 895.3839721679688, + "completions/mean_terminated_length": 671.0079956054688, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7249480581748442, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14006947485055238, + "kl": 0.031707763671875, + "learning_rate": 2.923402184091476e-07, + "loss": 0.0811, + "num_tokens": 1873491713.0, + "reward": 2.5005581378936768, + "reward_std": 0.4125838577747345, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1514935940504074, + "step": 3402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1052.493408203125, + "completions/mean_terminated_length": 822.760986328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7251611528421501, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12464653839890166, + "kl": 0.0294189453125, + "learning_rate": 2.9206583160147133e-07, + "loss": 0.0461, + "num_tokens": 1874028174.0, + "reward": 2.310826063156128, + "reward_std": 0.4906337857246399, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.1611565798521042, + "step": 3403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 983.6607666015625, + "completions/mean_terminated_length": 789.8892211914062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7253742475094561, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14022095706034168, + "kl": 0.030426025390625, + "learning_rate": 2.9179158754061405e-07, + "loss": 0.0727, + "num_tokens": 1874539398.0, + "reward": 2.484375, + "reward_std": 0.40690091252326965, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13686132431030273, + "step": 3404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 898.2366333007812, + "completions/mean_terminated_length": 750.5339965820312, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.725587342176762, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12909018496604843, + "kl": 0.029815673828125, + "learning_rate": 2.9151748637834923e-07, + "loss": 0.1061, + "num_tokens": 1875005376.0, + "reward": 2.6356027126312256, + "reward_std": 0.3758071959018707, + "rewards/accuracy_reward/mean": 0.7209821343421936, + "rewards/accuracy_reward/std": 0.449017733335495, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13519908487796783, + "step": 3405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1033.477783203125, + "completions/mean_terminated_length": 799.357177734375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.725800436844068, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11217433270572724, + "kl": 0.026519775390625, + "learning_rate": 2.9124352826637015e-07, + "loss": 0.0372, + "num_tokens": 1875540534.0, + "reward": 2.4676339626312256, + "reward_std": 0.42221924662590027, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12848198413848877, + "step": 3406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 1137.5648193359375, + "completions/mean_terminated_length": 855.383056640625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.7260135315113739, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10470909152618864, + "kl": 0.023773193359375, + "learning_rate": 2.9096971335629227e-07, + "loss": 0.0782, + "num_tokens": 1876113763.0, + "reward": 2.361049175262451, + "reward_std": 0.38826024532318115, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.1489589959383011, + "step": 3407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1032.3192138671875, + "completions/mean_terminated_length": 791.02490234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7262266261786798, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.12516592055488948, + "kl": 0.02801513671875, + "learning_rate": 2.906960417996509e-07, + "loss": 0.0507, + "num_tokens": 1876648338.0, + "reward": 2.4174108505249023, + "reward_std": 0.3114190101623535, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.1299937218427658, + "step": 3408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1077.091552734375, + "completions/mean_terminated_length": 812.29833984375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7264397208459858, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13383448456698407, + "kl": 0.024749755859375, + "learning_rate": 2.904225137479024e-07, + "loss": 0.1346, + "num_tokens": 1877194171.0, + "reward": 2.349888563156128, + "reward_std": 0.4980581998825073, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14275364577770233, + "step": 3409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 974.13623046875, + "completions/mean_terminated_length": 771.8965454101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7266528155132917, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14558345691938124, + "kl": 0.028564453125, + "learning_rate": 2.901491293524236e-07, + "loss": 0.1216, + "num_tokens": 1877703736.0, + "reward": 2.470982313156128, + "reward_std": 0.45858171582221985, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16402913630008698, + "step": 3410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 946.74560546875, + "completions/mean_terminated_length": 792.6259765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7268659101805978, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11547998785104664, + "kl": 0.029693603515625, + "learning_rate": 2.898758887645116e-07, + "loss": 0.0622, + "num_tokens": 1878199798.0, + "reward": 2.4871652126312256, + "reward_std": 0.40343886613845825, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15641570091247559, + "step": 3411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1051.477783203125, + "completions/mean_terminated_length": 818.1322631835938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7270790048479037, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12519129143993266, + "kl": 0.026947021484375, + "learning_rate": 2.8960279213538463e-07, + "loss": 0.097, + "num_tokens": 1878739500.0, + "reward": 2.4698662757873535, + "reward_std": 0.4294077455997467, + "rewards/accuracy_reward/mean": 0.5763888955116272, + "rewards/accuracy_reward/std": 0.4947032034397125, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13189572095870972, + "step": 3412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 957.9710083007812, + "completions/mean_terminated_length": 731.738525390625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7272920995152097, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14688524122295993, + "kl": 0.0283203125, + "learning_rate": 2.893298396161805e-07, + "loss": 0.113, + "num_tokens": 1879241039.0, + "reward": 2.3521206378936768, + "reward_std": 0.48625805974006653, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.17125900089740753, + "step": 3413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1065.296875, + "completions/mean_terminated_length": 793.7236328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7275051941825156, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12837300590149925, + "kl": 0.027923583984375, + "learning_rate": 2.8905703135795745e-07, + "loss": 0.0906, + "num_tokens": 1879791956.0, + "reward": 2.256138563156128, + "reward_std": 0.4179733097553253, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14801737666130066, + "step": 3414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 896.4866333007812, + "completions/mean_terminated_length": 704.5677490234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7277182888498215, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14254997194340524, + "kl": 0.02886962890625, + "learning_rate": 2.887843675116941e-07, + "loss": 0.1022, + "num_tokens": 1880261758.0, + "reward": 2.4308037757873535, + "reward_std": 0.3681824505329132, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11923423409461975, + "step": 3415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1101.41748046875, + "completions/mean_terminated_length": 836.374267578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7279313835171275, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11386296398519406, + "kl": 0.025726318359375, + "learning_rate": 2.88511848228289e-07, + "loss": 0.0686, + "num_tokens": 1880824409.0, + "reward": 2.4190850257873535, + "reward_std": 0.422359824180603, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15667887032032013, + "step": 3416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 975.5245971679688, + "completions/mean_terminated_length": 728.0302124023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7281444781844334, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12697691856832877, + "kl": 0.030242919921875, + "learning_rate": 2.8823947365856064e-07, + "loss": 0.0405, + "num_tokens": 1881323364.0, + "reward": 2.400111675262451, + "reward_std": 0.3837801516056061, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14151519536972046, + "step": 3417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 945.30810546875, + "completions/mean_terminated_length": 758.1671142578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7283575728517394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.332477326817764, + "kl": 0.04132080078125, + "learning_rate": 2.879672439532474e-07, + "loss": 0.0984, + "num_tokens": 1881811918.0, + "reward": 2.4095983505249023, + "reward_std": 0.501810610294342, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.15942463278770447, + "step": 3418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 976.9710083007812, + "completions/mean_terminated_length": 771.8803100585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7285706675190453, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12300772630348249, + "kl": 0.029052734375, + "learning_rate": 2.876951592630079e-07, + "loss": 0.0712, + "num_tokens": 1882318913.0, + "reward": 2.407924175262451, + "reward_std": 0.3882731795310974, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14323382079601288, + "step": 3419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1013.700927734375, + "completions/mean_terminated_length": 818.9124755859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7287837621863513, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12796250893393432, + "kl": 0.028717041015625, + "learning_rate": 2.874232197384201e-07, + "loss": 0.0501, + "num_tokens": 1882837739.0, + "reward": 2.46875, + "reward_std": 0.3944890797138214, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12383606284856796, + "step": 3420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1049.904052734375, + "completions/mean_terminated_length": 816.1901245117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7289968568536572, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12971459548470377, + "kl": 0.0279541015625, + "learning_rate": 2.871514255299815e-07, + "loss": 0.0406, + "num_tokens": 1883380672.0, + "reward": 2.36328125, + "reward_std": 0.46721577644348145, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.16796153783798218, + "step": 3421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 975.3594360351562, + "completions/mean_terminated_length": 806.286865234375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.7292099515209632, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11805064951570657, + "kl": 0.02899169921875, + "learning_rate": 2.8687977678810965e-07, + "loss": 0.0224, + "num_tokens": 1883886289.0, + "reward": 2.506138563156128, + "reward_std": 0.37737151980400085, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13622933626174927, + "step": 3422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1040.8192138671875, + "completions/mean_terminated_length": 838.302978515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7294230461882691, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13589890123814963, + "kl": 0.026702880859375, + "learning_rate": 2.86608273663141e-07, + "loss": 0.0741, + "num_tokens": 1884425792.0, + "reward": 2.3448662757873535, + "reward_std": 0.40496307611465454, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.13064035773277283, + "step": 3423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 926.4263916015625, + "completions/mean_terminated_length": 700.9088745117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.729636140855575, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13125399988346317, + "kl": 0.03070068359375, + "learning_rate": 2.863369163053323e-07, + "loss": 0.0688, + "num_tokens": 1884915807.0, + "reward": 2.4810268878936768, + "reward_std": 0.36501941084861755, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.11662477999925613, + "step": 3424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 944.6495971679688, + "completions/mean_terminated_length": 743.7757568359375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.7298492355228811, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12733418155021453, + "kl": 0.029296875, + "learning_rate": 2.860657048648584e-07, + "loss": 0.059, + "num_tokens": 1885407538.0, + "reward": 2.4536831378936768, + "reward_std": 0.36568281054496765, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.11109180748462677, + "step": 3425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 920.37060546875, + "completions/mean_terminated_length": 781.8897094726562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.730062330190187, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11131642959655688, + "kl": 0.03009033203125, + "learning_rate": 2.8579463949181455e-07, + "loss": 0.0686, + "num_tokens": 1885886232.0, + "reward": 2.58203125, + "reward_std": 0.382823646068573, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764313220977783, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13463464379310608, + "step": 3426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1110.2723388671875, + "completions/mean_terminated_length": 819.631591796875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.730275424857493, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12969345318674996, + "kl": 0.02398681640625, + "learning_rate": 2.855237203362146e-07, + "loss": 0.0961, + "num_tokens": 1886457618.0, + "reward": 2.35546875, + "reward_std": 0.44602110981941223, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16145840287208557, + "step": 3427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1061.3148193359375, + "completions/mean_terminated_length": 802.8309326171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7304885195247989, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1345681550978294, + "kl": 0.027679443359375, + "learning_rate": 2.8525294754799145e-07, + "loss": 0.1109, + "num_tokens": 1887007375.0, + "reward": 2.4341518878936768, + "reward_std": 0.49060893058776855, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.2979368567466736, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15339045226573944, + "step": 3428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 891.85498046875, + "completions/mean_terminated_length": 706.15283203125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7307016141921049, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13442112984228843, + "kl": 0.03326416015625, + "learning_rate": 2.8498232127699725e-07, + "loss": 0.0513, + "num_tokens": 1887472206.0, + "reward": 2.3643975257873535, + "reward_std": 0.39711326360702515, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14963631331920624, + "step": 3429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1082.7523193359375, + "completions/mean_terminated_length": 826.4434814453125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7309147088594108, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10795708636100353, + "kl": 0.0240478515625, + "learning_rate": 2.8471184167300253e-07, + "loss": 0.0928, + "num_tokens": 1888020367.0, + "reward": 2.3392858505249023, + "reward_std": 0.43019357323646545, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14336557686328888, + "step": 3430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1059.352783203125, + "completions/mean_terminated_length": 831.2033081054688, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7311278035267167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11889543676058181, + "kl": 0.027008056640625, + "learning_rate": 2.844415088856978e-07, + "loss": 0.0393, + "num_tokens": 1888559565.0, + "reward": 2.4609375, + "reward_std": 0.43998026847839355, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12758491933345795, + "step": 3431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1007.4888916015625, + "completions/mean_terminated_length": 814.8015747070312, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7313408981940227, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12055097754386625, + "kl": 0.02777099609375, + "learning_rate": 2.8417132306469084e-07, + "loss": 0.063, + "num_tokens": 1889078920.0, + "reward": 2.4185268878936768, + "reward_std": 0.41142937541007996, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.11899843066930771, + "step": 3432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1000.7366333007812, + "completions/mean_terminated_length": 711.3219604492188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7315539928613286, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.127692054687112, + "kl": 0.0277099609375, + "learning_rate": 2.8390128435950926e-07, + "loss": 0.0696, + "num_tokens": 1889601666.0, + "reward": 2.33203125, + "reward_std": 0.467107355594635, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.17351123690605164, + "step": 3433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 925.5245971679688, + "completions/mean_terminated_length": 745.2305297851562, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7317670875286346, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13244525952250213, + "kl": 0.026580810546875, + "learning_rate": 2.836313929195987e-07, + "loss": 0.0496, + "num_tokens": 1890088925.0, + "reward": 2.4542412757873535, + "reward_std": 0.3674367666244507, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12695714831352234, + "step": 3434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1008.10498046875, + "completions/mean_terminated_length": 795.6531982421875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7319801821959405, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11900216531987744, + "kl": 0.0299072265625, + "learning_rate": 2.8336164889432323e-07, + "loss": 0.0281, + "num_tokens": 1890608620.0, + "reward": 2.49609375, + "reward_std": 0.39869406819343567, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13180458545684814, + "step": 3435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1016.6763916015625, + "completions/mean_terminated_length": 785.61474609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7321932768632465, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12972400230234757, + "kl": 0.0277099609375, + "learning_rate": 2.830920524329658e-07, + "loss": 0.0873, + "num_tokens": 1891133083.0, + "reward": 2.4603796005249023, + "reward_std": 0.39249271154403687, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15853238105773926, + "step": 3436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1072.046875, + "completions/mean_terminated_length": 769.5584716796875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7324063715305524, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1547349320933971, + "kl": 0.02862548828125, + "learning_rate": 2.828226036847271e-07, + "loss": 0.0957, + "num_tokens": 1891680960.0, + "reward": 2.310267925262451, + "reward_std": 0.47386637330055237, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.1620074063539505, + "step": 3437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1125.075927734375, + "completions/mean_terminated_length": 849.5362548828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7326194661978584, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11588134102096417, + "kl": 0.02362060546875, + "learning_rate": 2.825533027987267e-07, + "loss": 0.0916, + "num_tokens": 1892252562.0, + "reward": 2.4503350257873535, + "reward_std": 0.4494834840297699, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14566238224506378, + "step": 3438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1117.7835693359375, + "completions/mean_terminated_length": 893.6038818359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7328325608651644, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12306119461212686, + "kl": 0.02740478515625, + "learning_rate": 2.8228414992400217e-07, + "loss": 0.1132, + "num_tokens": 1892815761.0, + "reward": 2.3956475257873535, + "reward_std": 0.5686297416687012, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.170974463224411, + "step": 3439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1064.122802734375, + "completions/mean_terminated_length": 799.3399047851562, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.7330456555324703, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13123760740262957, + "kl": 0.02606201171875, + "learning_rate": 2.820151452095083e-07, + "loss": 0.1018, + "num_tokens": 1893369560.0, + "reward": 2.4341518878936768, + "reward_std": 0.4745834469795227, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16057941317558289, + "step": 3440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 986.5067138671875, + "completions/mean_terminated_length": 759.2493286132812, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.7332587501997763, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12208238629650009, + "kl": 0.026336669921875, + "learning_rate": 2.817462888041193e-07, + "loss": 0.0947, + "num_tokens": 1893879979.0, + "reward": 2.4609375, + "reward_std": 0.4501286447048187, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.13915444910526276, + "step": 3441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1039.134033203125, + "completions/mean_terminated_length": 826.4541015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7334718448670822, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11353960645070311, + "kl": 0.023681640625, + "learning_rate": 2.814775808566263e-07, + "loss": 0.0331, + "num_tokens": 1894415815.0, + "reward": 2.478236675262451, + "reward_std": 0.3729163110256195, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.191280335187912, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.09668362140655518, + "step": 3442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 895.638427734375, + "completions/mean_terminated_length": 788.8341674804688, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.7336849395343882, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14788515588120468, + "kl": 0.032745361328125, + "learning_rate": 2.812090215157388e-07, + "loss": 0.0939, + "num_tokens": 1894885125.0, + "reward": 2.5825893878936768, + "reward_std": 0.43872371315956116, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.46403056383132935, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13636787235736847, + "step": 3443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 967.0781860351562, + "completions/mean_terminated_length": 776.9947509765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7338980342016941, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14387623739469202, + "kl": 0.028533935546875, + "learning_rate": 2.809406109300834e-07, + "loss": 0.0971, + "num_tokens": 1895387576.0, + "reward": 2.46484375, + "reward_std": 0.5205824971199036, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.18014755845069885, + "step": 3444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1071.3773193359375, + "completions/mean_terminated_length": 764.9296264648438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7341111288690001, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13057785123646307, + "kl": 0.0240478515625, + "learning_rate": 2.806723492482053e-07, + "loss": 0.0976, + "num_tokens": 1895942129.0, + "reward": 2.404576063156128, + "reward_std": 0.5068159103393555, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16403579711914062, + "step": 3445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1103.997802734375, + "completions/mean_terminated_length": 804.1382446289062, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.734324223536306, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12612839498942743, + "kl": 0.024810791015625, + "learning_rate": 2.804042366185667e-07, + "loss": 0.0779, + "num_tokens": 1896503616.0, + "reward": 2.368861675262451, + "reward_std": 0.45546820759773254, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.1728263646364212, + "step": 3446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1110.015625, + "completions/mean_terminated_length": 840.4798583984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7345373182036119, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12849803606219865, + "kl": 0.024688720703125, + "learning_rate": 2.8013627318954723e-07, + "loss": 0.0882, + "num_tokens": 1897069575.0, + "reward": 2.2354912757873535, + "reward_std": 0.43667352199554443, + "rewards/accuracy_reward/mean": 0.37731480598449707, + "rewards/accuracy_reward/std": 0.4852766990661621, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16988763213157654, + "step": 3447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 964.7254638671875, + "completions/mean_terminated_length": 722.0245971679688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.7347504128709179, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14044283063040156, + "kl": 0.02764892578125, + "learning_rate": 2.7986845910944433e-07, + "loss": 0.1366, + "num_tokens": 1897570284.0, + "reward": 2.447544813156128, + "reward_std": 0.45131516456604004, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16988763213157654, + "step": 3448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 988.7545166015625, + "completions/mean_terminated_length": 808.9869384765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7349635075382238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1522361476399345, + "kl": 0.029388427734375, + "learning_rate": 2.7960079452647223e-07, + "loss": 0.097, + "num_tokens": 1898082286.0, + "reward": 2.380580425262451, + "reward_std": 0.5102304220199585, + "rewards/accuracy_reward/mean": 0.5092592835426331, + "rewards/accuracy_reward/std": 0.5004938840866089, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16460275650024414, + "step": 3449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 883.72998046875, + "completions/mean_terminated_length": 693.2129516601562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7351766022055298, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1403254926354319, + "kl": 0.03173828125, + "learning_rate": 2.7933327958876353e-07, + "loss": 0.099, + "num_tokens": 1898544373.0, + "reward": 2.5011162757873535, + "reward_std": 0.39355558156967163, + "rewards/accuracy_reward/mean": 0.6111111044883728, + "rewards/accuracy_reward/std": 0.488063246011734, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1396559476852417, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1010.7031860351562, + "completions/mean_terminated_length": 798.7822875976562, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7353896968728357, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13476578623333105, + "kl": 0.027618408203125, + "learning_rate": 2.790659144443665e-07, + "loss": 0.0865, + "num_tokens": 1899062496.0, + "reward": 2.5033483505249023, + "reward_std": 0.4906647503376007, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15788236260414124, + "step": 3451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1025.3638916015625, + "completions/mean_terminated_length": 813.1185913085938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7356027915401417, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13425074228996248, + "kl": 0.027587890625, + "learning_rate": 2.7879869924124756e-07, + "loss": 0.1036, + "num_tokens": 1899590259.0, + "reward": 2.369419813156128, + "reward_std": 0.43779218196868896, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15391045808792114, + "step": 3452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1128.1273193359375, + "completions/mean_terminated_length": 828.7603759765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7358158862074476, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12962469915678396, + "kl": 0.024993896484375, + "learning_rate": 2.7853163412729e-07, + "loss": 0.1006, + "num_tokens": 1900165532.0, + "reward": 2.3543527126312256, + "reward_std": 0.44905099272727966, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14025692641735077, + "step": 3453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1066.7388916015625, + "completions/mean_terminated_length": 791.9857177734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7360289808747537, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13704990446898468, + "kl": 0.02703857421875, + "learning_rate": 2.782647192502938e-07, + "loss": 0.1069, + "num_tokens": 1900708487.0, + "reward": 2.41796875, + "reward_std": 0.49144795536994934, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.17302128672599792, + "step": 3454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 958.3348388671875, + "completions/mean_terminated_length": 766.7139282226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7362420755420596, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13511312582994642, + "kl": 0.029541015625, + "learning_rate": 2.779979547579759e-07, + "loss": 0.0464, + "num_tokens": 1901201677.0, + "reward": 2.3582589626312256, + "reward_std": 0.4277730882167816, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401999473572, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1468711644411087, + "step": 3455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1149.2254638671875, + "completions/mean_terminated_length": 856.724853515625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7364551702093655, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12135483923037568, + "kl": 0.02593994140625, + "learning_rate": 2.7773134079797e-07, + "loss": 0.1183, + "num_tokens": 1901790834.0, + "reward": 2.3705358505249023, + "reward_std": 0.4890635311603546, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.17407242953777313, + "step": 3456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1024.4710693359375, + "completions/mean_terminated_length": 777.8032836914062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.7366682648766715, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12016402931133427, + "kl": 0.02642822265625, + "learning_rate": 2.7746487751782666e-07, + "loss": 0.0266, + "num_tokens": 1902324405.0, + "reward": 2.3895089626312256, + "reward_std": 0.41061267256736755, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1421368569135666, + "step": 3457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 948.7545166015625, + "completions/mean_terminated_length": 782.0308227539062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7368813595439774, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11516075086670463, + "kl": 0.02825927734375, + "learning_rate": 2.7719856506501306e-07, + "loss": 0.0744, + "num_tokens": 1902814423.0, + "reward": 2.493861675262451, + "reward_std": 0.408820778131485, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12517839670181274, + "step": 3458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1186.263427734375, + "completions/mean_terminated_length": 885.1746826171875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.7370944542112834, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11188611030742304, + "kl": 0.0211181640625, + "learning_rate": 2.7693240358691216e-07, + "loss": 0.1032, + "num_tokens": 1903415533.0, + "reward": 2.341517925262451, + "reward_std": 0.44203147292137146, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.17942628264427185, + "step": 3459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1090.5648193359375, + "completions/mean_terminated_length": 891.8517456054688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7373075488785893, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12408047414546623, + "kl": 0.026153564453125, + "learning_rate": 2.766663932308244e-07, + "loss": 0.0641, + "num_tokens": 1903973114.0, + "reward": 2.3588171005249023, + "reward_std": 0.5045943260192871, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15252020955085754, + "step": 3460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1041.930908203125, + "completions/mean_terminated_length": 771.1755981445312, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7375206435458953, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1234701359575885, + "kl": 0.025848388671875, + "learning_rate": 2.7640053414396626e-07, + "loss": 0.0644, + "num_tokens": 1904509115.0, + "reward": 2.3856027126312256, + "reward_std": 0.44344204664230347, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11418405175209045, + "step": 3461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1036.7991943359375, + "completions/mean_terminated_length": 757.3504028320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7377337382132012, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.2512970781528178, + "kl": 0.027618408203125, + "learning_rate": 2.761348264734701e-07, + "loss": 0.0627, + "num_tokens": 1905054257.0, + "reward": 2.330357313156128, + "reward_std": 0.4192860722541809, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.128681018948555, + "step": 3462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 953.7277221679688, + "completions/mean_terminated_length": 747.64453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7379468328805072, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13757513157259688, + "kl": 0.027252197265625, + "learning_rate": 2.7586927036638494e-07, + "loss": 0.0908, + "num_tokens": 1905553335.0, + "reward": 2.4056921005249023, + "reward_std": 0.36265894770622253, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11780035495758057, + "step": 3463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 977.80810546875, + "completions/mean_terminated_length": 779.6243286132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7381599275478131, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13983331108529778, + "kl": 0.028472900390625, + "learning_rate": 2.7560386596967553e-07, + "loss": 0.1025, + "num_tokens": 1906061825.0, + "reward": 2.4637277126312256, + "reward_std": 0.4490770697593689, + "rewards/accuracy_reward/mean": 0.5902777910232544, + "rewards/accuracy_reward/std": 0.49235257506370544, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.16202956438064575, + "step": 3464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 911.1116333007812, + "completions/mean_terminated_length": 731.9121704101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.738373022215119, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13964961402269552, + "kl": 0.0283203125, + "learning_rate": 2.7533861343022325e-07, + "loss": 0.0334, + "num_tokens": 1906541235.0, + "reward": 2.3426339626312256, + "reward_std": 0.3650625944137573, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13295157253742218, + "step": 3465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1166.83935546875, + "completions/mean_terminated_length": 883.5162353515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.738586116882425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12765069854697872, + "kl": 0.02410888671875, + "learning_rate": 2.7507351289482495e-07, + "loss": 0.0736, + "num_tokens": 1907134811.0, + "reward": 2.345982313156128, + "reward_std": 0.5033276677131653, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.1695971041917801, + "step": 3466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 904.8370971679688, + "completions/mean_terminated_length": 748.159912109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7387992115497309, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1440952967123607, + "kl": 0.031402587890625, + "learning_rate": 2.7480856451019334e-07, + "loss": 0.0952, + "num_tokens": 1907608722.0, + "reward": 2.583705425262451, + "reward_std": 0.40793558955192566, + "rewards/accuracy_reward/mean": 0.6763392686843872, + "rewards/accuracy_reward/std": 0.46839532256126404, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.11046778410673141, + "step": 3467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1015.94873046875, + "completions/mean_terminated_length": 770.7651977539062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.739012306217037, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1221198046870945, + "kl": 0.02581787109375, + "learning_rate": 2.7454376842295725e-07, + "loss": 0.0641, + "num_tokens": 1908132091.0, + "reward": 2.377232313156128, + "reward_std": 0.3425656259059906, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09329447150230408, + "step": 3468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 828.7611694335938, + "completions/mean_terminated_length": 675.5904541015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7392254008843429, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14017910963868296, + "kl": 0.0318603515625, + "learning_rate": 2.742791247796609e-07, + "loss": 0.0445, + "num_tokens": 1908568432.0, + "reward": 2.5206475257873535, + "reward_std": 0.363179087638855, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11317373067140579, + "step": 3469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 997.0067138671875, + "completions/mean_terminated_length": 805.6649169921875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7394384955516489, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13436998160381114, + "kl": 0.027130126953125, + "learning_rate": 2.7401463372676435e-07, + "loss": 0.1046, + "num_tokens": 1909088995.0, + "reward": 2.3783483505249023, + "reward_std": 0.4117995798587799, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.14969991147518158, + "step": 3470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1033.384033203125, + "completions/mean_terminated_length": 832.6310424804688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7396515902189548, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11285202153271146, + "kl": 0.025726318359375, + "learning_rate": 2.7375029541064304e-07, + "loss": 0.0455, + "num_tokens": 1909618719.0, + "reward": 2.3984375, + "reward_std": 0.35587483644485474, + "rewards/accuracy_reward/mean": 0.4791666567325592, + "rewards/accuracy_reward/std": 0.5001450181007385, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.10836843401193619, + "step": 3471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 1078.0491943359375, + "completions/mean_terminated_length": 784.8081665039062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7398646848862607, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.122578830556465, + "kl": 0.024749755859375, + "learning_rate": 2.7348610997758827e-07, + "loss": 0.1056, + "num_tokens": 1910172837.0, + "reward": 2.369419813156128, + "reward_std": 0.42102834582328796, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15645259618759155, + "step": 3472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 998.279052734375, + "completions/mean_terminated_length": 776.9865112304688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7400777795535667, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1319045417106927, + "kl": 0.02923583984375, + "learning_rate": 2.7322207757380624e-07, + "loss": 0.0872, + "num_tokens": 1910697042.0, + "reward": 2.384486675262451, + "reward_std": 0.4264853894710541, + "rewards/accuracy_reward/mean": 0.5162037014961243, + "rewards/accuracy_reward/std": 0.5003167390823364, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.16169020533561707, + "step": 3473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 879.4219360351562, + "completions/mean_terminated_length": 732.6155395507812, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7402908742208726, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13779882468186908, + "kl": 0.03167724609375, + "learning_rate": 2.729581983454187e-07, + "loss": 0.0398, + "num_tokens": 1911157215.0, + "reward": 2.4536831378936768, + "reward_std": 0.3912586569786072, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13903217017650604, + "step": 3474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1032.763427734375, + "completions/mean_terminated_length": 770.3988647460938, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.7405039688881786, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12311671928460928, + "kl": 0.027923583984375, + "learning_rate": 2.726944724384627e-07, + "loss": 0.08, + "num_tokens": 1911691781.0, + "reward": 2.361049175262451, + "reward_std": 0.37111398577690125, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14661912620067596, + "step": 3475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 972.8504638671875, + "completions/mean_terminated_length": 753.1962280273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7407170635554845, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13943072372803567, + "kl": 0.027130126953125, + "learning_rate": 2.724308999988901e-07, + "loss": 0.0328, + "num_tokens": 1912191234.0, + "reward": 2.3895089626312256, + "reward_std": 0.4337301552295685, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16796617209911346, + "step": 3476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1107.1741943359375, + "completions/mean_terminated_length": 822.7383422851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7409301582227905, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1308343998283261, + "kl": 0.02545166015625, + "learning_rate": 2.721674811725686e-07, + "loss": 0.1172, + "num_tokens": 1912751440.0, + "reward": 2.3448662757873535, + "reward_std": 0.4103396534919739, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14835961163043976, + "step": 3477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1043.40625, + "completions/mean_terminated_length": 801.3019409179688, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.7411432528900964, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1365424929180653, + "kl": 0.02545166015625, + "learning_rate": 2.719042161052796e-07, + "loss": 0.0948, + "num_tokens": 1913290118.0, + "reward": 2.3895089626312256, + "reward_std": 0.5162388682365417, + "rewards/accuracy_reward/mean": 0.5185185074806213, + "rewards/accuracy_reward/std": 0.5002362728118896, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13559210300445557, + "step": 3478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 888.8214721679688, + "completions/mean_terminated_length": 713.0076904296875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7413563475574024, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1117017238388437, + "kl": 0.029327392578125, + "learning_rate": 2.716411049427209e-07, + "loss": 0.039, + "num_tokens": 1913757734.0, + "reward": 2.458705425262451, + "reward_std": 0.32308757305145264, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9854910969734192, + "rewards/tag_count_reward/std": 0.08887429535388947, + "step": 3479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1076.3660888671875, + "completions/mean_terminated_length": 775.2163696289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7415694422247083, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.5592069382020158, + "kl": 0.122894287109375, + "learning_rate": 2.7137814783050383e-07, + "loss": 0.0815, + "num_tokens": 1914314762.0, + "reward": 2.42578125, + "reward_std": 0.42657729983329773, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14661914110183716, + "step": 3480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1071.109375, + "completions/mean_terminated_length": 808.206787109375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7417825368920142, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11953810062825712, + "kl": 0.027099609375, + "learning_rate": 2.711153449141554e-07, + "loss": 0.0586, + "num_tokens": 1914865611.0, + "reward": 2.41796875, + "reward_std": 0.4024464786052704, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1312635987997055, + "step": 3481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 915.8326416015625, + "completions/mean_terminated_length": 757.3867797851562, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.7419956315593202, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14218067422316027, + "kl": 0.030426025390625, + "learning_rate": 2.708526963391167e-07, + "loss": 0.0845, + "num_tokens": 1915347024.0, + "reward": 2.537388563156128, + "reward_std": 0.388633131980896, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11660737544298172, + "step": 3482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 985.4241333007812, + "completions/mean_terminated_length": 781.9520874023438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7422087262266261, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12886452836336423, + "kl": 0.02740478515625, + "learning_rate": 2.7059020225074354e-07, + "loss": 0.0613, + "num_tokens": 1915853070.0, + "reward": 2.447544813156128, + "reward_std": 0.4652882516384125, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.16680268943309784, + "step": 3483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1034.6116943359375, + "completions/mean_terminated_length": 820.9783935546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7424218208939322, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12501776245645774, + "kl": 0.025848388671875, + "learning_rate": 2.7032786279430656e-07, + "loss": 0.0596, + "num_tokens": 1916387424.0, + "reward": 2.3275671005249023, + "reward_std": 0.4162757098674774, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509721994400024, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14372976124286652, + "step": 3484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 990.0535888671875, + "completions/mean_terminated_length": 851.1312866210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7426349155612381, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1328114569763756, + "kl": 0.028076171875, + "learning_rate": 2.7006567811499047e-07, + "loss": 0.0987, + "num_tokens": 1916900024.0, + "reward": 2.454799175262451, + "reward_std": 0.4706345796585083, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14126798510551453, + "step": 3485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1134.2857666015625, + "completions/mean_terminated_length": 826.07763671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7428480102285441, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12279710045297361, + "kl": 0.025299072265625, + "learning_rate": 2.6980364835789444e-07, + "loss": 0.109, + "num_tokens": 1917482056.0, + "reward": 2.239955425262451, + "reward_std": 0.527755081653595, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15283602476119995, + "step": 3486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 952.9063110351562, + "completions/mean_terminated_length": 770.390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.74306110489585, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13688211120565447, + "kl": 0.03094482421875, + "learning_rate": 2.695417736680318e-07, + "loss": 0.0976, + "num_tokens": 1917976862.0, + "reward": 2.4871652126312256, + "reward_std": 0.4360974431037903, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14963631331920624, + "step": 3487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1061.4576416015625, + "completions/mean_terminated_length": 827.085693359375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7432741995631559, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12300436453475487, + "kl": 0.025787353515625, + "learning_rate": 2.692800541903302e-07, + "loss": 0.0812, + "num_tokens": 1918521307.0, + "reward": 2.4676339626312256, + "reward_std": 0.5184011459350586, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15063102543354034, + "step": 3488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 936.1451416015625, + "completions/mean_terminated_length": 719.7039794921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7434872942304619, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12759943402613363, + "kl": 0.02984619140625, + "learning_rate": 2.690184900696313e-07, + "loss": 0.0449, + "num_tokens": 1919007596.0, + "reward": 2.5184152126312256, + "reward_std": 0.3949706256389618, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14869897067546844, + "step": 3489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 954.7656860351562, + "completions/mean_terminated_length": 734.9464111328125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7437003888977678, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14106229739273263, + "kl": 0.027740478515625, + "learning_rate": 2.6875708145069065e-07, + "loss": 0.0905, + "num_tokens": 1919501107.0, + "reward": 2.372767925262451, + "reward_std": 0.484919011592865, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.16456104815006256, + "step": 3490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1061.72998046875, + "completions/mean_terminated_length": 847.3233642578125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7439134835650738, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1163484796512355, + "kl": 0.026763916015625, + "learning_rate": 2.6849582847817843e-07, + "loss": 0.0334, + "num_tokens": 1920046554.0, + "reward": 2.4464287757873535, + "reward_std": 0.39093706011772156, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11805575340986252, + "step": 3491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1114.0045166015625, + "completions/mean_terminated_length": 824.5204467773438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7441265782323797, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11948643836840166, + "kl": 0.025848388671875, + "learning_rate": 2.6823473129667787e-07, + "loss": 0.033, + "num_tokens": 1920625516.0, + "reward": 2.2081475257873535, + "reward_std": 0.4602033495903015, + "rewards/accuracy_reward/mean": 0.3526785671710968, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1787492334842682, + "step": 3492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 818.4777221679688, + "completions/mean_terminated_length": 701.2371826171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7443396728996857, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1299716580565299, + "kl": 0.035369873046875, + "learning_rate": 2.6797379005068594e-07, + "loss": 0.0457, + "num_tokens": 1921053490.0, + "reward": 2.5736608505249023, + "reward_std": 0.37030670046806335, + "rewards/accuracy_reward/mean": 0.6607142686843872, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12782442569732666, + "step": 3493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 967.810302734375, + "completions/mean_terminated_length": 743.6199340820312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7445527675669916, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.2883007918642504, + "kl": 0.0340576171875, + "learning_rate": 2.6771300488461405e-07, + "loss": 0.1406, + "num_tokens": 1921561917.0, + "reward": 2.4598214626312256, + "reward_std": 0.4820263981819153, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.16899244487285614, + "step": 3494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1110.05810546875, + "completions/mean_terminated_length": 786.1441650390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7447658622342976, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.3346360506286928, + "kl": 0.027191162109375, + "learning_rate": 2.674523759427867e-07, + "loss": 0.1122, + "num_tokens": 1922136967.0, + "reward": 2.3286831378936768, + "reward_std": 0.427215039730072, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1454136222600937, + "step": 3495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1096.4754638671875, + "completions/mean_terminated_length": 815.9682006835938, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.7449789569016035, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11798150624817462, + "kl": 0.02471923828125, + "learning_rate": 2.671919033694423e-07, + "loss": 0.0542, + "num_tokens": 1922697436.0, + "reward": 2.431361675262451, + "reward_std": 0.44612422585487366, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1217813789844513, + "step": 3496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1071.712158203125, + "completions/mean_terminated_length": 805.4517211914062, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7451920515689094, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12965966948036092, + "kl": 0.025634765625, + "learning_rate": 2.66931587308732e-07, + "loss": 0.0701, + "num_tokens": 1923246539.0, + "reward": 2.4441964626312256, + "reward_std": 0.4448496103286743, + "rewards/accuracy_reward/mean": 0.5671296119689941, + "rewards/accuracy_reward/std": 0.4960475564002991, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13271193206310272, + "step": 3497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1039.40185546875, + "completions/mean_terminated_length": 796.3323974609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7454051462362155, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13061103848622702, + "kl": 0.026519775390625, + "learning_rate": 2.6667142790472116e-07, + "loss": 0.0979, + "num_tokens": 1923783503.0, + "reward": 2.36328125, + "reward_std": 0.401105672121048, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14321638643741608, + "step": 3498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1037.8125, + "completions/mean_terminated_length": 773.1718139648438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7456182409035214, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12107353137447877, + "kl": 0.02606201171875, + "learning_rate": 2.6641142530138814e-07, + "loss": 0.0749, + "num_tokens": 1924321595.0, + "reward": 2.3744421005249023, + "reward_std": 0.397959440946579, + "rewards/accuracy_reward/mean": 0.48148149251937866, + "rewards/accuracy_reward/std": 0.5002362728118896, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13519908487796783, + "step": 3499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1033.0513916015625, + "completions/mean_terminated_length": 841.9071655273438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7458313355708274, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11664976188410903, + "kl": 0.025299072265625, + "learning_rate": 2.6615157964262436e-07, + "loss": 0.0846, + "num_tokens": 1924852562.0, + "reward": 2.454799175262451, + "reward_std": 0.42508262395858765, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13826683163642883, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 978.62060546875, + "completions/mean_terminated_length": 773.845703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7460444302381333, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13066019456290967, + "kl": 0.029022216796875, + "learning_rate": 2.658918910722344e-07, + "loss": 0.0762, + "num_tokens": 1925358536.0, + "reward": 2.4268975257873535, + "reward_std": 0.4405536949634552, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.12909629940986633, + "step": 3501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 983.0803833007812, + "completions/mean_terminated_length": 733.718994140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7462575249054393, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1397188026506883, + "kl": 0.028839111328125, + "learning_rate": 2.656323597339361e-07, + "loss": 0.112, + "num_tokens": 1925873756.0, + "reward": 2.5518975257873535, + "reward_std": 0.437661737203598, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1246887594461441, + "step": 3502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1036.415283203125, + "completions/mean_terminated_length": 782.1061401367188, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7464706195727452, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13525837584867187, + "kl": 0.028045654296875, + "learning_rate": 2.653729857713604e-07, + "loss": 0.0875, + "num_tokens": 1926412134.0, + "reward": 2.357701063156128, + "reward_std": 0.4112618565559387, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15551921725273132, + "step": 3503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 955.9777221679688, + "completions/mean_terminated_length": 729.33154296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7466837142400511, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14044591093172423, + "kl": 0.02783203125, + "learning_rate": 2.651137693280506e-07, + "loss": 0.1089, + "num_tokens": 1926910364.0, + "reward": 2.439732313156128, + "reward_std": 0.4063902199268341, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13583585619926453, + "step": 3504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1131.841552734375, + "completions/mean_terminated_length": 904.7158813476562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7468968089073571, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11957806596376697, + "kl": 0.022857666015625, + "learning_rate": 2.6485471054746315e-07, + "loss": 0.0627, + "num_tokens": 1927487765.0, + "reward": 2.404017925262451, + "reward_std": 0.47729891538619995, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12607400119304657, + "step": 3505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 854.4710083007812, + "completions/mean_terminated_length": 640.8921508789062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.747109903574663, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13722161850513545, + "kl": 0.03155517578125, + "learning_rate": 2.6459580957296757e-07, + "loss": 0.0681, + "num_tokens": 1927931800.0, + "reward": 2.40625, + "reward_std": 0.3701859414577484, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09585530310869217, + "step": 3506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1142.013427734375, + "completions/mean_terminated_length": 885.0143432617188, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.747322998241969, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11558254378589872, + "kl": 0.024444580078125, + "learning_rate": 2.6433706654784555e-07, + "loss": 0.0318, + "num_tokens": 1928523918.0, + "reward": 2.4564733505249023, + "reward_std": 0.40328356623649597, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.191280335187912, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11752051115036011, + "step": 3507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 927.4107666015625, + "completions/mean_terminated_length": 733.801025390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7475360929092749, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1312303597305569, + "kl": 0.0301513671875, + "learning_rate": 2.640784816152916e-07, + "loss": 0.0414, + "num_tokens": 1929010150.0, + "reward": 2.5066964626312256, + "reward_std": 0.4345788359642029, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.1299937218427658, + "step": 3508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 918.7902221679688, + "completions/mean_terminated_length": 747.5218505859375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7477491875765809, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12394669621573583, + "kl": 0.03009033203125, + "learning_rate": 2.6382005491841244e-07, + "loss": 0.0181, + "num_tokens": 1929489688.0, + "reward": 2.3878350257873535, + "reward_std": 0.39658868312835693, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.11614610999822617, + "step": 3509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 885.904052734375, + "completions/mean_terminated_length": 719.8903198242188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7479622822438868, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.5240799943464431, + "kl": 0.054046630859375, + "learning_rate": 2.635617866002278e-07, + "loss": 0.0685, + "num_tokens": 1929960253.0, + "reward": 2.4871652126312256, + "reward_std": 0.4223470091819763, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.12242549657821655, + "step": 3510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 880.5535888671875, + "completions/mean_terminated_length": 671.6421508789062, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7481753769111928, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.1479007680914857, + "kl": 0.030303955078125, + "learning_rate": 2.633036768036695e-07, + "loss": 0.0914, + "num_tokens": 1930423189.0, + "reward": 2.400111675262451, + "reward_std": 0.3164609670639038, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1217813715338707, + "step": 3511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1149.5960693359375, + "completions/mean_terminated_length": 850.1279907226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7483884715784987, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12626343790534925, + "kl": 0.025054931640625, + "learning_rate": 2.6304572567158105e-07, + "loss": 0.137, + "num_tokens": 1931011232.0, + "reward": 2.3565850257873535, + "reward_std": 0.4179708659648895, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550675868988, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15266746282577515, + "step": 3512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 1029.2567138671875, + "completions/mean_terminated_length": 758.742919921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7486015662458047, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12138239803794859, + "kl": 0.025634765625, + "learning_rate": 2.627879333467191e-07, + "loss": 0.0658, + "num_tokens": 1931544803.0, + "reward": 2.39453125, + "reward_std": 0.42185673117637634, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15968577563762665, + "step": 3513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1019.8460083007812, + "completions/mean_terminated_length": 768.5194702148438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7488146609131107, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11616291493775444, + "kl": 0.02825927734375, + "learning_rate": 2.6253029997175186e-07, + "loss": 0.0531, + "num_tokens": 1932068110.0, + "reward": 2.3448662757873535, + "reward_std": 0.3450391888618469, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12292034178972244, + "step": 3514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 857.2656860351562, + "completions/mean_terminated_length": 680.1820678710938, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7490277555804166, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1265011559597205, + "kl": 0.028167724609375, + "learning_rate": 2.622728256892597e-07, + "loss": 0.0753, + "num_tokens": 1932525749.0, + "reward": 2.560826063156128, + "reward_std": 0.3422622084617615, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.485796183347702, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.191280335187912, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11067523807287216, + "step": 3515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 996.3035888671875, + "completions/mean_terminated_length": 771.1436767578125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7492408502477226, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13357169768061244, + "kl": 0.026824951171875, + "learning_rate": 2.620155106417348e-07, + "loss": 0.0652, + "num_tokens": 1933045437.0, + "reward": 2.501674175262451, + "reward_std": 0.4418174624443054, + "rewards/accuracy_reward/mean": 0.6157407164573669, + "rewards/accuracy_reward/std": 0.48698359727859497, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14126798510551453, + "step": 3516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 945.1094360351562, + "completions/mean_terminated_length": 781.0897827148438, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7494539449150285, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1338695465535572, + "kl": 0.029541015625, + "learning_rate": 2.6175835497158125e-07, + "loss": 0.0831, + "num_tokens": 1933533438.0, + "reward": 2.482701063156128, + "reward_std": 0.4090162217617035, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11801211535930634, + "step": 3517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 859.3192138671875, + "completions/mean_terminated_length": 689.5076293945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7496670395823345, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1504201522100094, + "kl": 0.032867431640625, + "learning_rate": 2.6150135882111544e-07, + "loss": 0.0933, + "num_tokens": 1933985309.0, + "reward": 2.532924175262451, + "reward_std": 0.42589065432548523, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 3518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 1116.1607666015625, + "completions/mean_terminated_length": 862.0227661132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7498801342496404, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.2604814505094918, + "kl": 0.025360107421875, + "learning_rate": 2.612445223325648e-07, + "loss": 0.0593, + "num_tokens": 1934567189.0, + "reward": 2.3136162757873535, + "reward_std": 0.44190889596939087, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1549774408340454, + "step": 3519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1952.0, + "completions/mean_length": 1072.0067138671875, + "completions/mean_terminated_length": 812.8446044921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7500932289169464, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12756135792043935, + "kl": 0.026397705078125, + "learning_rate": 2.6098784564806875e-07, + "loss": 0.1015, + "num_tokens": 1935121880.0, + "reward": 2.3705358505249023, + "reward_std": 0.4008837640285492, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14336557686328888, + "step": 3520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 901.2723388671875, + "completions/mean_terminated_length": 663.272216796875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7503063235842523, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14866352343964825, + "kl": 0.034820556640625, + "learning_rate": 2.607313289096779e-07, + "loss": 0.0479, + "num_tokens": 1935590114.0, + "reward": 2.439732313156128, + "reward_std": 0.3268306255340576, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12607400119304657, + "step": 3521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1082.5648193359375, + "completions/mean_terminated_length": 853.2072143554688, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.7505194182515582, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11294717013774547, + "kl": 0.024505615234375, + "learning_rate": 2.6047497225935523e-07, + "loss": 0.0836, + "num_tokens": 1936151231.0, + "reward": 2.4229912757873535, + "reward_std": 0.39541906118392944, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11141301691532135, + "step": 3522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1130.399658203125, + "completions/mean_terminated_length": 831.772216796875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7507325129188642, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1206857297800884, + "kl": 0.026702880859375, + "learning_rate": 2.60218775838974e-07, + "loss": 0.0567, + "num_tokens": 1936725042.0, + "reward": 2.306361675262451, + "reward_std": 0.48713958263397217, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.18550105392932892, + "step": 3523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 983.90185546875, + "completions/mean_terminated_length": 738.3406982421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7509456075861701, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14549911823578737, + "kl": 0.02783203125, + "learning_rate": 2.599627397903193e-07, + "loss": 0.1225, + "num_tokens": 1937240182.0, + "reward": 2.4296875, + "reward_std": 0.5068896412849426, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15465489029884338, + "step": 3524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1045.09375, + "completions/mean_terminated_length": 753.1815185546875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7511587022534761, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13782383556429997, + "kl": 0.027618408203125, + "learning_rate": 2.5970686425508783e-07, + "loss": 0.0618, + "num_tokens": 1937778736.0, + "reward": 2.36328125, + "reward_std": 0.4292701482772827, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.11734377592802048, + "step": 3525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 945.8594360351562, + "completions/mean_terminated_length": 731.309326171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.751371796920782, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13754740482309039, + "kl": 0.0283203125, + "learning_rate": 2.59451149374887e-07, + "loss": 0.046, + "num_tokens": 1938268353.0, + "reward": 2.4135046005249023, + "reward_std": 0.3300292193889618, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10940459370613098, + "step": 3526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 919.58935546875, + "completions/mean_terminated_length": 777.8291625976562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.751584891588088, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1174617002064403, + "kl": 0.028533935546875, + "learning_rate": 2.591955952912353e-07, + "loss": 0.0473, + "num_tokens": 1938753241.0, + "reward": 2.5870537757873535, + "reward_std": 0.4012986719608307, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.10325382649898529, + "step": 3527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1010.7902221679688, + "completions/mean_terminated_length": 798.8870849609375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.751797986255394, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12752559118865228, + "kl": 0.025482177734375, + "learning_rate": 2.5894020214556246e-07, + "loss": 0.0881, + "num_tokens": 1939279755.0, + "reward": 2.3482143878936768, + "reward_std": 0.4496425986289978, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15220166742801666, + "step": 3528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 912.779052734375, + "completions/mean_terminated_length": 720.1174926757812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7520110809226999, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1424765568988169, + "kl": 0.034393310546875, + "learning_rate": 2.5868497007920887e-07, + "loss": 0.107, + "num_tokens": 1939766936.0, + "reward": 2.4654018878936768, + "reward_std": 0.3953595459461212, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9854910969734192, + "rewards/tag_count_reward/std": 0.09196697175502777, + "step": 3529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1044.4710693359375, + "completions/mean_terminated_length": 812.8873901367188, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7522241755900059, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11476163690264074, + "kl": 0.02606201171875, + "learning_rate": 2.584298992334263e-07, + "loss": 0.0567, + "num_tokens": 1940305547.0, + "reward": 2.4408483505249023, + "reward_std": 0.4668347239494324, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14744803309440613, + "step": 3530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1118.15185546875, + "completions/mean_terminated_length": 847.5042724609375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.7524372702573118, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12287073235266664, + "kl": 0.028106689453125, + "learning_rate": 2.5817498974937654e-07, + "loss": 0.0793, + "num_tokens": 1940875615.0, + "reward": 2.2583706378936768, + "reward_std": 0.37075507640838623, + "rewards/accuracy_reward/mean": 0.3638392984867096, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14321638643741608, + "step": 3531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1043.5625, + "completions/mean_terminated_length": 808.3636474609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7526503649246178, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11882026885742407, + "kl": 0.025421142578125, + "learning_rate": 2.579202417681328e-07, + "loss": 0.0685, + "num_tokens": 1941408027.0, + "reward": 2.282924175262451, + "reward_std": 0.38527238368988037, + "rewards/accuracy_reward/mean": 0.3973214328289032, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13099703192710876, + "step": 3532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1056.9576416015625, + "completions/mean_terminated_length": 844.783203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7528634595919237, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11700048784282295, + "kl": 0.0260009765625, + "learning_rate": 2.576656554306783e-07, + "loss": 0.0528, + "num_tokens": 1941954696.0, + "reward": 2.423549175262451, + "reward_std": 0.398449569940567, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1480342447757721, + "step": 3533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1007.8906860351562, + "completions/mean_terminated_length": 795.3951416015625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7530765542592297, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1280614576504268, + "kl": 0.0267333984375, + "learning_rate": 2.5741123087790734e-07, + "loss": 0.1171, + "num_tokens": 1942473463.0, + "reward": 2.41015625, + "reward_std": 0.41179290413856506, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12201692909002304, + "step": 3534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1018.8660888671875, + "completions/mean_terminated_length": 808.6129150390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7532896489265356, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14203926026606373, + "kl": 0.02801513671875, + "learning_rate": 2.5715696825062426e-07, + "loss": 0.1109, + "num_tokens": 1942999051.0, + "reward": 2.3939733505249023, + "reward_std": 0.4207933843135834, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.11730785667896271, + "step": 3535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1063.3616943359375, + "completions/mean_terminated_length": 801.9039306640625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7535027435938416, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12285770090197136, + "kl": 0.025726318359375, + "learning_rate": 2.5690286768954395e-07, + "loss": 0.038, + "num_tokens": 1943545357.0, + "reward": 2.4871652126312256, + "reward_std": 0.4131576120853424, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13124457001686096, + "step": 3536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 954.2589721679688, + "completions/mean_terminated_length": 771.96875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7537158382611475, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14278764091455756, + "kl": 0.030029296875, + "learning_rate": 2.566489293352918e-07, + "loss": 0.0936, + "num_tokens": 1944039169.0, + "reward": 2.51953125, + "reward_std": 0.4585893452167511, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407156348228455, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1247187927365303, + "step": 3537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1148.265625, + "completions/mean_terminated_length": 879.6492919921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7539289329284534, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11359663314860509, + "kl": 0.021759033203125, + "learning_rate": 2.5639515332840324e-07, + "loss": 0.0834, + "num_tokens": 1944627192.0, + "reward": 2.3515625, + "reward_std": 0.38736072182655334, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.4966535270214081, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11752051115036011, + "step": 3538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1099.57373046875, + "completions/mean_terminated_length": 844.3314208984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7541420275957594, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11785983211313968, + "kl": 0.02374267578125, + "learning_rate": 2.5614153980932366e-07, + "loss": 0.0596, + "num_tokens": 1945191561.0, + "reward": 2.4291296005249023, + "reward_std": 0.4149930775165558, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10092891752719879, + "step": 3539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1093.9888916015625, + "completions/mean_terminated_length": 775.985107421875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7543551222630653, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11803431143361388, + "kl": 0.02618408203125, + "learning_rate": 2.5588808891840897e-07, + "loss": 0.068, + "num_tokens": 1945748212.0, + "reward": 2.4681921005249023, + "reward_std": 0.4166988134384155, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12428762763738632, + "step": 3540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 1012.341552734375, + "completions/mean_terminated_length": 790.6151733398438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7545682169303713, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13351143303602822, + "kl": 0.027618408203125, + "learning_rate": 2.556348007959246e-07, + "loss": 0.0415, + "num_tokens": 1946281181.0, + "reward": 2.4330358505249023, + "reward_std": 0.38546356558799744, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.1033746674656868, + "step": 3541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1020.3906860351562, + "completions/mean_terminated_length": 813.7667846679688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7547813115976773, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14625910080746216, + "kl": 0.030517578125, + "learning_rate": 2.5538167558204625e-07, + "loss": 0.0693, + "num_tokens": 1946807692.0, + "reward": 2.466517925262451, + "reward_std": 0.4417068362236023, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11686538904905319, + "step": 3542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 991.0870971679688, + "completions/mean_terminated_length": 801.955322265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7549944062649833, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1276555336301265, + "kl": 0.02886962890625, + "learning_rate": 2.551287134168593e-07, + "loss": 0.0579, + "num_tokens": 1947321491.0, + "reward": 2.385044813156128, + "reward_std": 0.44310033321380615, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.14876298606395721, + "step": 3543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1056.5670166015625, + "completions/mean_terminated_length": 803.8487548828125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7552075009322892, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1283431741677651, + "kl": 0.02557373046875, + "learning_rate": 2.548759144403591e-07, + "loss": 0.0955, + "num_tokens": 1947871825.0, + "reward": 2.2896206378936768, + "reward_std": 0.41937577724456787, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15122967958450317, + "step": 3544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1091.5023193359375, + "completions/mean_terminated_length": 851.0418701171875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.7554205955995951, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13434317132116813, + "kl": 0.025634765625, + "learning_rate": 2.5462327879245064e-07, + "loss": 0.068, + "num_tokens": 1948429394.0, + "reward": 2.4224331378936768, + "reward_std": 0.5017857551574707, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15641570091247559, + "step": 3545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1032.888427734375, + "completions/mean_terminated_length": 835.2799682617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7556336902669011, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13709049878649024, + "kl": 0.026947021484375, + "learning_rate": 2.5437080661294785e-07, + "loss": 0.0992, + "num_tokens": 1948959376.0, + "reward": 2.3666296005249023, + "reward_std": 0.4290775656700134, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12338031083345413, + "step": 3546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 984.4107666015625, + "completions/mean_terminated_length": 753.1956787109375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.755846784934207, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12042122035549503, + "kl": 0.02716064453125, + "learning_rate": 2.5411849804157524e-07, + "loss": 0.0659, + "num_tokens": 1949464808.0, + "reward": 2.396205425262451, + "reward_std": 0.30729636549949646, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14409084618091583, + "step": 3547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 952.1763916015625, + "completions/mean_terminated_length": 776.1632080078125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.756059879601513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1354007590023983, + "kl": 0.030548095703125, + "learning_rate": 2.5386635321796613e-07, + "loss": 0.0733, + "num_tokens": 1949961575.0, + "reward": 2.4810268878936768, + "reward_std": 0.4675096273422241, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.14819122850894928, + "step": 3548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1005.5625610351562, + "completions/mean_terminated_length": 782.3848266601562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7562729742688189, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1397903050341739, + "kl": 0.026397705078125, + "learning_rate": 2.536143722816636e-07, + "loss": 0.1048, + "num_tokens": 1950485955.0, + "reward": 2.3080358505249023, + "reward_std": 0.42131131887435913, + "rewards/accuracy_reward/mean": 0.42824074625968933, + "rewards/accuracy_reward/std": 0.4953974783420563, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13840332627296448, + "step": 3549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 972.93310546875, + "completions/mean_terminated_length": 790.4804077148438, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7564860689361249, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1291851724264226, + "kl": 0.0277099609375, + "learning_rate": 2.5336255537211935e-07, + "loss": 0.0408, + "num_tokens": 1950995205.0, + "reward": 2.4068081378936768, + "reward_std": 0.4471297860145569, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13017487525939941, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 948.122802734375, + "completions/mean_terminated_length": 747.8812866210938, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7566991636034308, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13055931442172414, + "kl": 0.02972412109375, + "learning_rate": 2.531109026286952e-07, + "loss": 0.0468, + "num_tokens": 1951492700.0, + "reward": 2.4559152126312256, + "reward_std": 0.452176570892334, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1395072489976883, + "step": 3551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1001.3928833007812, + "completions/mean_terminated_length": 784.1724853515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7569122582707368, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12868678362427405, + "kl": 0.028961181640625, + "learning_rate": 2.5285941419066155e-07, + "loss": 0.093, + "num_tokens": 1952006332.0, + "reward": 2.5541296005249023, + "reward_std": 0.44663339853286743, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.13750630617141724, + "step": 3552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1061.3125, + "completions/mean_terminated_length": 795.7733764648438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7571253529380427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13779584654039653, + "kl": 0.028656005859375, + "learning_rate": 2.5260809019719794e-07, + "loss": 0.1017, + "num_tokens": 1952548792.0, + "reward": 2.2622768878936768, + "reward_std": 0.5183950662612915, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1634153127670288, + "step": 3553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1018.90185546875, + "completions/mean_terminated_length": 774.419921875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7573384476053486, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12245419355886863, + "kl": 0.028900146484375, + "learning_rate": 2.5235693078739304e-07, + "loss": 0.0375, + "num_tokens": 1953078508.0, + "reward": 2.4732143878936768, + "reward_std": 0.3816849887371063, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13165414333343506, + "step": 3554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 985.1406860351562, + "completions/mean_terminated_length": 788.3148193359375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7575515422726546, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12065835306994759, + "kl": 0.02655029296875, + "learning_rate": 2.521059361002441e-07, + "loss": 0.0941, + "num_tokens": 1953588443.0, + "reward": 2.3130581378936768, + "reward_std": 0.4270966947078705, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13170984387397766, + "step": 3555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1100.384033203125, + "completions/mean_terminated_length": 845.3597412109375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7577646369399605, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12398138916722021, + "kl": 0.02642822265625, + "learning_rate": 2.51855106274658e-07, + "loss": 0.0533, + "num_tokens": 1954154871.0, + "reward": 2.3716518878936768, + "reward_std": 0.38874921202659607, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15208269655704498, + "step": 3556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1026.6920166015625, + "completions/mean_terminated_length": 791.0054931640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7579777316072666, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13494639588625432, + "kl": 0.026153564453125, + "learning_rate": 2.5160444144944936e-07, + "loss": 0.0969, + "num_tokens": 1954701933.0, + "reward": 2.412388563156128, + "reward_std": 0.4836609363555908, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13878051936626434, + "step": 3557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1043.7410888671875, + "completions/mean_terminated_length": 766.2108154296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7581908262745725, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.6311853281922855, + "kl": 0.05853271484375, + "learning_rate": 2.51353941763342e-07, + "loss": 0.1118, + "num_tokens": 1955242793.0, + "reward": 2.404576063156128, + "reward_std": 0.47136369347572327, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16824373602867126, + "step": 3558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1031.071533203125, + "completions/mean_terminated_length": 789.480712890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7584039209418785, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16113232936407149, + "kl": 0.028411865234375, + "learning_rate": 2.511036073549687e-07, + "loss": 0.0945, + "num_tokens": 1955779145.0, + "reward": 2.3353796005249023, + "reward_std": 0.4416998624801636, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14827017486095428, + "step": 3559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 958.544677734375, + "completions/mean_terminated_length": 746.4639892578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7586170156091844, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.14311874398246424, + "kl": 0.02728271484375, + "learning_rate": 2.508534383628701e-07, + "loss": 0.1244, + "num_tokens": 1956275597.0, + "reward": 2.4916296005249023, + "reward_std": 0.37237876653671265, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9642857313156128, + "rewards/format_reward/std": 0.18578432500362396, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1222418025135994, + "step": 3560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1029.08935546875, + "completions/mean_terminated_length": 793.9560546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7588301102764903, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12746763745529244, + "kl": 0.027496337890625, + "learning_rate": 2.506034349254956e-07, + "loss": 0.0477, + "num_tokens": 1956804597.0, + "reward": 2.4659600257873535, + "reward_std": 0.47420525550842285, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.1539783626794815, + "step": 3561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 926.5625610351562, + "completions/mean_terminated_length": 753.144287109375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7590432049437963, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13528317209219723, + "kl": 0.029449462890625, + "learning_rate": 2.50353597181203e-07, + "loss": 0.0263, + "num_tokens": 1957292385.0, + "reward": 2.4609375, + "reward_std": 0.42423495650291443, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14213687181472778, + "step": 3562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 937.40185546875, + "completions/mean_terminated_length": 724.7340087890625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.7592562996111022, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.131637857665679, + "kl": 0.026336669921875, + "learning_rate": 2.5010392526825845e-07, + "loss": 0.097, + "num_tokens": 1957781733.0, + "reward": 2.515625, + "reward_std": 0.36496657133102417, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11409792304039001, + "step": 3563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 945.8170166015625, + "completions/mean_terminated_length": 702.5558471679688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7594693942784082, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1315357767767974, + "kl": 0.029296875, + "learning_rate": 2.498544193248363e-07, + "loss": 0.0655, + "num_tokens": 1958273651.0, + "reward": 2.4681921005249023, + "reward_std": 0.4260401129722595, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13776934146881104, + "step": 3564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 936.8303833007812, + "completions/mean_terminated_length": 765.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7596824889457141, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13561159221762725, + "kl": 0.029754638671875, + "learning_rate": 2.496050794890186e-07, + "loss": 0.0826, + "num_tokens": 1958760647.0, + "reward": 2.369419813156128, + "reward_std": 0.4852680563926697, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1361066997051239, + "step": 3565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1091.9241943359375, + "completions/mean_terminated_length": 831.1761474609375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7598955836130201, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.11312988337358344, + "kl": 0.023773193359375, + "learning_rate": 2.4935590589879627e-07, + "loss": 0.0704, + "num_tokens": 1959319685.0, + "reward": 2.3956475257873535, + "reward_std": 0.39094406366348267, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14249980449676514, + "step": 3566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1109.8326416015625, + "completions/mean_terminated_length": 896.4959106445312, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.760108678280326, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11541046608609773, + "kl": 0.02362060546875, + "learning_rate": 2.491068986920677e-07, + "loss": 0.0463, + "num_tokens": 1959895178.0, + "reward": 2.3705358505249023, + "reward_std": 0.4613640308380127, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13989263772964478, + "step": 3567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1019.8817138671875, + "completions/mean_terminated_length": 826.2572631835938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.760321772947632, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1165216032864574, + "kl": 0.02740478515625, + "learning_rate": 2.4885805800663927e-07, + "loss": 0.0316, + "num_tokens": 1960423029.0, + "reward": 2.3097100257873535, + "reward_std": 0.3930506110191345, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14801737666130066, + "step": 3568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1045.3616943359375, + "completions/mean_terminated_length": 817.3643798828125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7605348676149379, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13967905785880028, + "kl": 0.028167724609375, + "learning_rate": 2.486093839802253e-07, + "loss": 0.089, + "num_tokens": 1960957991.0, + "reward": 2.4838171005249023, + "reward_std": 0.41798660159111023, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 3569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1049.296875, + "completions/mean_terminated_length": 784.1044921875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.7607479622822438, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13017657702219562, + "kl": 0.029144287109375, + "learning_rate": 2.483608767504477e-07, + "loss": 0.0269, + "num_tokens": 1961489436.0, + "reward": 2.345424175262451, + "reward_std": 0.46885377168655396, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14078108966350555, + "step": 3570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 962.7232666015625, + "completions/mean_terminated_length": 781.84375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7609610569495499, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16573863707475095, + "kl": 0.02923583984375, + "learning_rate": 2.481125364548364e-07, + "loss": 0.0867, + "num_tokens": 1961992592.0, + "reward": 2.470424175262451, + "reward_std": 0.4638521671295166, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.1539783775806427, + "step": 3571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 952.0022583007812, + "completions/mean_terminated_length": 749.0396728515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7611741516168558, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12540751817174545, + "kl": 0.027740478515625, + "learning_rate": 2.478643632308287e-07, + "loss": 0.0506, + "num_tokens": 1962486625.0, + "reward": 2.4068081378936768, + "reward_std": 0.3216070532798767, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549686908722, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.1119314506649971, + "step": 3572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 955.22998046875, + "completions/mean_terminated_length": 782.9844970703125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.7613872462841618, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1.4512675117178448, + "kl": 0.096282958984375, + "learning_rate": 2.476163572157694e-07, + "loss": 0.1696, + "num_tokens": 1962989352.0, + "reward": 2.494419813156128, + "reward_std": 0.5137013792991638, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14835961163043976, + "step": 3573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 935.58935546875, + "completions/mean_terminated_length": 708.3225708007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7616003409514677, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.140295697425878, + "kl": 0.0291748046875, + "learning_rate": 2.4736851854691075e-07, + "loss": 0.1019, + "num_tokens": 1963470848.0, + "reward": 2.373326063156128, + "reward_std": 0.4100281298160553, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.17818261682987213, + "step": 3574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 999.38623046875, + "completions/mean_terminated_length": 788.5388793945312, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.7618134356187737, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1343333206956619, + "kl": 0.028717041015625, + "learning_rate": 2.47120847361413e-07, + "loss": 0.0761, + "num_tokens": 1963983117.0, + "reward": 2.4425225257873535, + "reward_std": 0.44747480750083923, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316954612732, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14294590055942535, + "step": 3575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1120.5379638671875, + "completions/mean_terminated_length": 870.9376831054688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.7620265302860796, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11180280074402434, + "kl": 0.025604248046875, + "learning_rate": 2.4687334379634257e-07, + "loss": 0.0514, + "num_tokens": 1964559534.0, + "reward": 2.3487725257873535, + "reward_std": 0.43206730484962463, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14102916419506073, + "step": 3576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 998.3147583007812, + "completions/mean_terminated_length": 800.628662109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7622396249533855, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15631073540325244, + "kl": 0.0303955078125, + "learning_rate": 2.466260079886738e-07, + "loss": 0.0805, + "num_tokens": 1965079067.0, + "reward": 2.4765625, + "reward_std": 0.4737948477268219, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.158765509724617, + "step": 3577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1006.4620971679688, + "completions/mean_terminated_length": 807.0186157226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7624527196206915, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12244953831029336, + "kl": 0.0260009765625, + "learning_rate": 2.4637884007528856e-07, + "loss": 0.0899, + "num_tokens": 1965601466.0, + "reward": 2.4458706378936768, + "reward_std": 0.3801722824573517, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13669590651988983, + "step": 3578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 860.5402221679688, + "completions/mean_terminated_length": 704.611083984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7626658142879974, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.17167211218465026, + "kl": 0.02978515625, + "learning_rate": 2.46131840192975e-07, + "loss": 0.111, + "num_tokens": 1966055004.0, + "reward": 2.556361675262451, + "reward_std": 0.396171510219574, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.126290425658226, + "step": 3579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 959.5558471679688, + "completions/mean_terminated_length": 744.1951904296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7628789089553034, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1243076404594816, + "kl": 0.028167724609375, + "learning_rate": 2.4588500847842886e-07, + "loss": 0.058, + "num_tokens": 1966553221.0, + "reward": 2.4888393878936768, + "reward_std": 0.40750470757484436, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09289216995239258, + "step": 3580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1021.3214721679688, + "completions/mean_terminated_length": 821.4613037109375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.7630920036226093, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1290515701931414, + "kl": 0.025146484375, + "learning_rate": 2.4563834506825254e-07, + "loss": 0.0882, + "num_tokens": 1967075525.0, + "reward": 2.3582589626312256, + "reward_std": 0.36674249172210693, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14649668335914612, + "step": 3581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1036.1629638671875, + "completions/mean_terminated_length": 778.2437133789062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7633050982899153, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13772930751318233, + "kl": 0.026123046875, + "learning_rate": 2.4539185009895514e-07, + "loss": 0.1087, + "num_tokens": 1967611214.0, + "reward": 2.3286831378936768, + "reward_std": 0.4171675443649292, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16573180258274078, + "step": 3582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 882.7433471679688, + "completions/mean_terminated_length": 716.278076171875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.7635181929572212, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13364333942123158, + "kl": 0.030364990234375, + "learning_rate": 2.451455237069532e-07, + "loss": 0.0478, + "num_tokens": 1968072859.0, + "reward": 2.447544813156128, + "reward_std": 0.3564707338809967, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12016765028238297, + "step": 3583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1016.4107666015625, + "completions/mean_terminated_length": 831.810546875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7637312876245272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14727939565231107, + "kl": 0.030487060546875, + "learning_rate": 2.44899366028569e-07, + "loss": 0.0676, + "num_tokens": 1968600467.0, + "reward": 2.4575893878936768, + "reward_std": 0.49572864174842834, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14033812284469604, + "step": 3584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1025.0513916015625, + "completions/mean_terminated_length": 799.2778930664062, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7639443822918331, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13558707228149589, + "kl": 0.026275634765625, + "learning_rate": 2.4465337720003237e-07, + "loss": 0.0827, + "num_tokens": 1969129754.0, + "reward": 2.46875, + "reward_std": 0.3993665277957916, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12891364097595215, + "step": 3585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 884.5178833007812, + "completions/mean_terminated_length": 669.0581665039062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.764157476959139, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1400686037826665, + "kl": 0.031524658203125, + "learning_rate": 2.444075573574792e-07, + "loss": 0.0527, + "num_tokens": 1969594274.0, + "reward": 2.47265625, + "reward_std": 0.38611456751823425, + "rewards/accuracy_reward/mean": 0.5856481194496155, + "rewards/accuracy_reward/std": 0.49318093061447144, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10346972197294235, + "step": 3586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 991.46435546875, + "completions/mean_terminated_length": 747.6483764648438, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7643705716264451, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1535987161073911, + "kl": 0.0294189453125, + "learning_rate": 2.441619066369519e-07, + "loss": 0.1006, + "num_tokens": 1970109090.0, + "reward": 2.428013563156128, + "reward_std": 0.5237821340560913, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.18275512754917145, + "step": 3587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1048.872802734375, + "completions/mean_terminated_length": 828.35693359375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.764583666293751, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11458527636540096, + "kl": 0.0247802734375, + "learning_rate": 2.4391642517439935e-07, + "loss": 0.0715, + "num_tokens": 1970649385.0, + "reward": 2.4419643878936768, + "reward_std": 0.4425899088382721, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.15971019864082336, + "step": 3588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1065.8951416015625, + "completions/mean_terminated_length": 855.6341552734375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.764796760961057, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.5952769315013996, + "kl": 0.038543701171875, + "learning_rate": 2.436711131056767e-07, + "loss": 0.0472, + "num_tokens": 1971197722.0, + "reward": 2.3504464626312256, + "reward_std": 0.44625774025917053, + "rewards/accuracy_reward/mean": 0.49537035822868347, + "rewards/accuracy_reward/std": 0.5005581974983215, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15035313367843628, + "step": 3589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 901.419677734375, + "completions/mean_terminated_length": 706.830322265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7650098556283629, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14981113750271696, + "kl": 0.029571533203125, + "learning_rate": 2.4342597056654555e-07, + "loss": 0.081, + "num_tokens": 1971670630.0, + "reward": 2.4637277126312256, + "reward_std": 0.3871533274650574, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 3590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1044.46875, + "completions/mean_terminated_length": 777.9943237304688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7652229502956689, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11603017677169247, + "kl": 0.02740478515625, + "learning_rate": 2.431809976926735e-07, + "loss": 0.1155, + "num_tokens": 1972206088.0, + "reward": 2.368861675262451, + "reward_std": 0.40957650542259216, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.16255265474319458, + "step": 3591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 919.0245971679688, + "completions/mean_terminated_length": 730.8619995117188, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7654360449629748, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1290785461980715, + "kl": 0.029083251953125, + "learning_rate": 2.429361946196342e-07, + "loss": 0.072, + "num_tokens": 1972683955.0, + "reward": 2.5518975257873535, + "reward_std": 0.36550506949424744, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.11358113586902618, + "step": 3592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1059.1451416015625, + "completions/mean_terminated_length": 837.5983276367188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7656491396302808, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.113915854628643, + "kl": 0.02484130859375, + "learning_rate": 2.426915614829074e-07, + "loss": 0.0248, + "num_tokens": 1973233492.0, + "reward": 2.486049175262451, + "reward_std": 0.42285555601119995, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9860491156578064, + "rewards/tag_count_reward/std": 0.08974618464708328, + "step": 3593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 981.7188110351562, + "completions/mean_terminated_length": 777.5372314453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7658622342975867, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.130512319149885, + "kl": 0.0283203125, + "learning_rate": 2.424470984178789e-07, + "loss": 0.0732, + "num_tokens": 1973743910.0, + "reward": 2.3917412757873535, + "reward_std": 0.4645915925502777, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16545002162456512, + "step": 3594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1091.4866943359375, + "completions/mean_terminated_length": 837.4971923828125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.7660753289648926, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.10218472436779619, + "kl": 0.024078369140625, + "learning_rate": 2.422028055598403e-07, + "loss": 0.0378, + "num_tokens": 1974308464.0, + "reward": 2.2466518878936768, + "reward_std": 0.3809371292591095, + "rewards/accuracy_reward/mean": 0.3415178656578064, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.14405618607997894, + "step": 3595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1110.138427734375, + "completions/mean_terminated_length": 823.0379028320312, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.7662884236321986, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13361386804718603, + "kl": 0.02593994140625, + "learning_rate": 2.4195868304398877e-07, + "loss": 0.0707, + "num_tokens": 1974876334.0, + "reward": 2.26953125, + "reward_std": 0.44167739152908325, + "rewards/accuracy_reward/mean": 0.3660714328289032, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.1571800261735916, + "step": 3596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 910.7723388671875, + "completions/mean_terminated_length": 751.6183471679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7665015182995045, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13812753928517843, + "kl": 0.029541015625, + "learning_rate": 2.417147310054277e-07, + "loss": 0.0488, + "num_tokens": 1975349432.0, + "reward": 2.4229912757873535, + "reward_std": 0.37660279870033264, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.13854306936264038, + "step": 3597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1127.03125, + "completions/mean_terminated_length": 862.3850708007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7667146129668105, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10273798755348457, + "kl": 0.023468017578125, + "learning_rate": 2.414709495791659e-07, + "loss": 0.0227, + "num_tokens": 1975931974.0, + "reward": 2.3130581378936768, + "reward_std": 0.40218326449394226, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11801212280988693, + "step": 3598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1145.8326416015625, + "completions/mean_terminated_length": 845.110107421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7669277076341164, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11078919554477011, + "kl": 0.024444580078125, + "learning_rate": 2.4122733890011764e-07, + "loss": 0.0522, + "num_tokens": 1976512587.0, + "reward": 2.3169643878936768, + "reward_std": 0.4617672860622406, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14140157401561737, + "step": 3599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1048.5513916015625, + "completions/mean_terminated_length": 775.9744262695312, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7671408023014225, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1176535199287874, + "kl": 0.026336669921875, + "learning_rate": 2.4098389910310264e-07, + "loss": 0.0666, + "num_tokens": 1977049506.0, + "reward": 2.407924175262451, + "reward_std": 0.48348382115364075, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.19128604233264923, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 896.857177734375, + "completions/mean_terminated_length": 780.8943481445312, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.7673538969687284, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14208384937400445, + "kl": 0.029144287109375, + "learning_rate": 2.4074063032284627e-07, + "loss": 0.0963, + "num_tokens": 1977517250.0, + "reward": 2.642857313156128, + "reward_std": 0.39701440930366516, + "rewards/accuracy_reward/mean": 0.7098214030265808, + "rewards/accuracy_reward/std": 0.4543519914150238, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.216333270072937, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10152243822813034, + "step": 3601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 978.1116333007812, + "completions/mean_terminated_length": 756.0592651367188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7675669916360343, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11383303283371794, + "kl": 0.02655029296875, + "learning_rate": 2.404975326939795e-07, + "loss": 0.0617, + "num_tokens": 1978025748.0, + "reward": 2.4324777126312256, + "reward_std": 0.40557077527046204, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11660738289356232, + "step": 3602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1047.107177734375, + "completions/mean_terminated_length": 795.4860229492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7677800863033403, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13607792291359277, + "kl": 0.02569580078125, + "learning_rate": 2.402546063510377e-07, + "loss": 0.0957, + "num_tokens": 1978571652.0, + "reward": 2.38671875, + "reward_std": 0.46462905406951904, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.11601705849170685, + "step": 3603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 924.0692138671875, + "completions/mean_terminated_length": 722.9447631835938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7679931809706462, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12167819209875655, + "kl": 0.0303955078125, + "learning_rate": 2.4001185142846244e-07, + "loss": 0.0769, + "num_tokens": 1979046323.0, + "reward": 2.486049175262451, + "reward_std": 0.34442827105522156, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10572549700737, + "step": 3604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1018.9576416015625, + "completions/mean_terminated_length": 795.2527465820312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7682062756379522, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1346863233587877, + "kl": 0.0264892578125, + "learning_rate": 2.3976926806059983e-07, + "loss": 0.0861, + "num_tokens": 1979566000.0, + "reward": 2.3722100257873535, + "reward_std": 0.3897438943386078, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16768629848957062, + "step": 3605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1065.859375, + "completions/mean_terminated_length": 835.8815307617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7684193703052581, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1352124405627495, + "kl": 0.027069091796875, + "learning_rate": 2.3952685638170127e-07, + "loss": 0.0869, + "num_tokens": 1980110545.0, + "reward": 2.439174175262451, + "reward_std": 0.4569382965564728, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15448829531669617, + "step": 3606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 998.40185546875, + "completions/mean_terminated_length": 810.5789794921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7686324649725641, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1266860043322466, + "kl": 0.027923583984375, + "learning_rate": 2.392846165259229e-07, + "loss": 0.0715, + "num_tokens": 1980626133.0, + "reward": 2.4760046005249023, + "reward_std": 0.5225756168365479, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1590748131275177, + "step": 3607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1048.65185546875, + "completions/mean_terminated_length": 837.9783935546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.76884555963987, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12133733303648937, + "kl": 0.023956298828125, + "learning_rate": 2.39042548627326e-07, + "loss": 0.0443, + "num_tokens": 1981171705.0, + "reward": 2.4854912757873535, + "reward_std": 0.48748332262039185, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.1251746416091919, + "step": 3608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1031.493408203125, + "completions/mean_terminated_length": 757.9291381835938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.769058654307176, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14746039766802882, + "kl": 0.0281982421875, + "learning_rate": 2.3880065281987694e-07, + "loss": 0.0897, + "num_tokens": 1981698294.0, + "reward": 2.365513563156128, + "reward_std": 0.4907568097114563, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.17125900089740753, + "step": 3609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1045.915283203125, + "completions/mean_terminated_length": 797.4874267578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7692717489744819, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11947018970142272, + "kl": 0.025970458984375, + "learning_rate": 2.3855892923744596e-07, + "loss": 0.085, + "num_tokens": 1982237968.0, + "reward": 2.2896206378936768, + "reward_std": 0.40791308879852295, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1422542929649353, + "step": 3610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 963.310302734375, + "completions/mean_terminated_length": 792.3385009765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7694848436417878, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13723790537981842, + "kl": 0.031707763671875, + "learning_rate": 2.3831737801380902e-07, + "loss": 0.0772, + "num_tokens": 1982736651.0, + "reward": 2.435826063156128, + "reward_std": 0.4453217387199402, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1339094191789627, + "step": 3611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1078.477783203125, + "completions/mean_terminated_length": 844.825439453125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7696979383090938, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13967204528406374, + "kl": 0.026153564453125, + "learning_rate": 2.3807599928264607e-07, + "loss": 0.1093, + "num_tokens": 1983292049.0, + "reward": 2.3058037757873535, + "reward_std": 0.47787559032440186, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.16626667976379395, + "step": 3612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 884.4285888671875, + "completions/mean_terminated_length": 704.4948120117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7699110329763997, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14579676265594846, + "kl": 0.0313720703125, + "learning_rate": 2.3783479317754173e-07, + "loss": 0.0969, + "num_tokens": 1983761553.0, + "reward": 2.4291296005249023, + "reward_std": 0.4548097848892212, + "rewards/accuracy_reward/mean": 0.5486111044883728, + "rewards/accuracy_reward/std": 0.49820831418037415, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1454136222600937, + "step": 3613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1099.102783203125, + "completions/mean_terminated_length": 840.3125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7701241276437057, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14073691009687478, + "kl": 0.02545166015625, + "learning_rate": 2.375937598319852e-07, + "loss": 0.1034, + "num_tokens": 1984326639.0, + "reward": 2.3236608505249023, + "reward_std": 0.44813287258148193, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14619384706020355, + "step": 3614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 977.1183471679688, + "completions/mean_terminated_length": 733.602783203125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7703372223110116, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1216279343940387, + "kl": 0.02752685546875, + "learning_rate": 2.373528993793698e-07, + "loss": 0.0169, + "num_tokens": 1984833972.0, + "reward": 2.36328125, + "reward_std": 0.41990092396736145, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11088934540748596, + "step": 3615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1068.2076416015625, + "completions/mean_terminated_length": 786.6580200195312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7705503169783177, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12228746702556395, + "kl": 0.024993896484375, + "learning_rate": 2.3711221195299366e-07, + "loss": 0.0552, + "num_tokens": 1985382305.0, + "reward": 2.3588171005249023, + "reward_std": 0.43226316571235657, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14748506247997284, + "step": 3616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 953.044677734375, + "completions/mean_terminated_length": 770.5521240234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7707634116456236, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1274991674342064, + "kl": 0.027740478515625, + "learning_rate": 2.368716976860588e-07, + "loss": 0.0131, + "num_tokens": 1985876341.0, + "reward": 2.4542412757873535, + "reward_std": 0.41259995102882385, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.1151164099574089, + "step": 3617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1006.15185546875, + "completions/mean_terminated_length": 800.0106811523438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7709765063129295, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13531107781855503, + "kl": 0.028533935546875, + "learning_rate": 2.3663135671167106e-07, + "loss": 0.0406, + "num_tokens": 1986404473.0, + "reward": 2.3738839626312256, + "reward_std": 0.3625422418117523, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.10404275357723236, + "step": 3618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1004.8594360351562, + "completions/mean_terminated_length": 774.62939453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7711896009802355, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12746021769799984, + "kl": 0.028106689453125, + "learning_rate": 2.3639118916284122e-07, + "loss": 0.0557, + "num_tokens": 1986918794.0, + "reward": 2.4135046005249023, + "reward_std": 0.48709607124328613, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1400342434644699, + "step": 3619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1077.8148193359375, + "completions/mean_terminated_length": 791.8063354492188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.7714026956475414, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11747572131021242, + "kl": 0.027587890625, + "learning_rate": 2.3615119517248344e-07, + "loss": 0.0222, + "num_tokens": 1987470599.0, + "reward": 2.41015625, + "reward_std": 0.4105912446975708, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14661912620067596, + "step": 3620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 869.8928833007812, + "completions/mean_terminated_length": 718.549072265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7716157903148474, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13613917443582463, + "kl": 0.030517578125, + "learning_rate": 2.3591137487341613e-07, + "loss": 0.1153, + "num_tokens": 1987924887.0, + "reward": 2.6434152126312256, + "reward_std": 0.366009384393692, + "rewards/accuracy_reward/mean": 0.7120535969734192, + "rewards/accuracy_reward/std": 0.4533122181892395, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10854540765285492, + "step": 3621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1080.6273193359375, + "completions/mean_terminated_length": 854.107421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7718288849821533, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12550457893038833, + "kl": 0.023773193359375, + "learning_rate": 2.356717283983613e-07, + "loss": 0.0763, + "num_tokens": 1988480544.0, + "reward": 2.2823662757873535, + "reward_std": 0.43554142117500305, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1502326875925064, + "step": 3622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1054.696533203125, + "completions/mean_terminated_length": 873.8575439453125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7720419796494593, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11276633326191235, + "kl": 0.025177001953125, + "learning_rate": 2.354322558799449e-07, + "loss": 0.0419, + "num_tokens": 1989021128.0, + "reward": 2.3794643878936768, + "reward_std": 0.3929978013038635, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407235741615295, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09478133171796799, + "step": 3623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1020.169677734375, + "completions/mean_terminated_length": 829.8306884765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7722550743167652, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11493500529344396, + "kl": 0.02374267578125, + "learning_rate": 2.351929574506969e-07, + "loss": 0.0031, + "num_tokens": 1989548884.0, + "reward": 2.529017925262451, + "reward_std": 0.3697143495082855, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.1282729059457779, + "step": 3624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 980.6406860351562, + "completions/mean_terminated_length": 779.6259765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7724681689840712, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12700642223122208, + "kl": 0.027374267578125, + "learning_rate": 2.3495383324305058e-07, + "loss": 0.0316, + "num_tokens": 1990061571.0, + "reward": 2.5390625, + "reward_std": 0.3659279942512512, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11389531940221786, + "step": 3625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1062.805908203125, + "completions/mean_terminated_length": 842.0792236328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7726812636513771, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12953766316555412, + "kl": 0.024932861328125, + "learning_rate": 2.347148833893428e-07, + "loss": 0.0952, + "num_tokens": 1990605596.0, + "reward": 2.4112725257873535, + "reward_std": 0.47081342339515686, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15641570091247559, + "step": 3626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 937.529052734375, + "completions/mean_terminated_length": 717.8101806640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.772894358318683, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.4539489850730734, + "kl": 0.031402587890625, + "learning_rate": 2.344761080218139e-07, + "loss": 0.0757, + "num_tokens": 1991096457.0, + "reward": 2.39453125, + "reward_std": 0.37792688608169556, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12315750867128372, + "step": 3627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 989.8192138671875, + "completions/mean_terminated_length": 770.19677734375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.773107452985989, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12234801992637155, + "kl": 0.02783203125, + "learning_rate": 2.3423750727260813e-07, + "loss": 0.0676, + "num_tokens": 1991604728.0, + "reward": 2.541294813156128, + "reward_std": 0.4165656864643097, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.4810029864311218, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13351379334926605, + "step": 3628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 941.7210083007812, + "completions/mean_terminated_length": 757.3411865234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7733205476532949, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14182930023237067, + "kl": 0.027496337890625, + "learning_rate": 2.3399908127377246e-07, + "loss": 0.0606, + "num_tokens": 1992093083.0, + "reward": 2.4402902126312256, + "reward_std": 0.3861028850078583, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12956927716732025, + "step": 3629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 945.2277221679688, + "completions/mean_terminated_length": 751.3018188476562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.773533642320601, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13044873391044542, + "kl": 0.029998779296875, + "learning_rate": 2.3376083015725737e-07, + "loss": 0.0592, + "num_tokens": 1992585857.0, + "reward": 2.3214287757873535, + "reward_std": 0.40314850211143494, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16145165264606476, + "step": 3630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1006.482177734375, + "completions/mean_terminated_length": 786.9189453125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7737467369879069, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13866862165000746, + "kl": 0.02685546875, + "learning_rate": 2.3352275405491683e-07, + "loss": 0.0803, + "num_tokens": 1993103561.0, + "reward": 2.53515625, + "reward_std": 0.41071146726608276, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12979069352149963, + "step": 3631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 888.0781860351562, + "completions/mean_terminated_length": 748.8875122070312, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7739598316552129, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1328342487872296, + "kl": 0.032989501953125, + "learning_rate": 2.3328485309850772e-07, + "loss": 0.0585, + "num_tokens": 1993567628.0, + "reward": 2.5262277126312256, + "reward_std": 0.39783385396003723, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1277548223733902, + "step": 3632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1102.22998046875, + "completions/mean_terminated_length": 819.8695678710938, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7741729263225188, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11539596183914869, + "kl": 0.02471923828125, + "learning_rate": 2.3304712741968992e-07, + "loss": 0.0406, + "num_tokens": 1994134547.0, + "reward": 2.325892925262451, + "reward_std": 0.3473705053329468, + "rewards/accuracy_reward/mean": 0.3928571343421936, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09178353101015091, + "step": 3633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 872.5960083007812, + "completions/mean_terminated_length": 772.9855346679688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.7743860209898247, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13167877442073989, + "kl": 0.033233642578125, + "learning_rate": 2.3280957715002638e-07, + "loss": 0.0468, + "num_tokens": 1994586606.0, + "reward": 2.5965402126312256, + "reward_std": 0.33661067485809326, + "rewards/accuracy_reward/mean": 0.6651785969734192, + "rewards/accuracy_reward/std": 0.47245556116104126, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10326440632343292, + "step": 3634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 965.52685546875, + "completions/mean_terminated_length": 781.8172607421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7745991156571307, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12956765625867853, + "kl": 0.028228759765625, + "learning_rate": 2.3257220242098294e-07, + "loss": 0.0671, + "num_tokens": 1995089018.0, + "reward": 2.3247768878936768, + "reward_std": 0.5176705121994019, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.1785159856081009, + "step": 3635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 994.3750610351562, + "completions/mean_terminated_length": 799.2592163085938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7748122103244366, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12657641872165445, + "kl": 0.0302734375, + "learning_rate": 2.323350033639287e-07, + "loss": 0.0978, + "num_tokens": 1995606946.0, + "reward": 2.5078125, + "reward_std": 0.46482858061790466, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12805373966693878, + "step": 3636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 987.357177734375, + "completions/mean_terminated_length": 797.5579223632812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7750253049917426, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13239562371121621, + "kl": 0.02752685546875, + "learning_rate": 2.3209798011013458e-07, + "loss": 0.1171, + "num_tokens": 1996119730.0, + "reward": 2.453125, + "reward_std": 0.4176524877548218, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.14753690361976624, + "step": 3637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 951.6607666015625, + "completions/mean_terminated_length": 738.239990234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7752383996590485, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12646928223688847, + "kl": 0.0289306640625, + "learning_rate": 2.318611327907753e-07, + "loss": 0.0511, + "num_tokens": 1996612122.0, + "reward": 2.4135046005249023, + "reward_std": 0.3974364697933197, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.12909629940986633, + "step": 3638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 983.6652221679688, + "completions/mean_terminated_length": 789.8944702148438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7754514943263545, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15258903985359568, + "kl": 0.027984619140625, + "learning_rate": 2.316244615369276e-07, + "loss": 0.0741, + "num_tokens": 1997121876.0, + "reward": 2.4268975257873535, + "reward_std": 0.4359312951564789, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14680634438991547, + "step": 3639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1171.825927734375, + "completions/mean_terminated_length": 883.2344360351562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7756645889936604, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1249174755472304, + "kl": 0.021881103515625, + "learning_rate": 2.313879664795709e-07, + "loss": 0.076, + "num_tokens": 1997727270.0, + "reward": 2.333705425262451, + "reward_std": 0.4284088909626007, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15247619152069092, + "step": 3640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 881.9553833007812, + "completions/mean_terminated_length": 662.3554077148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7758776836609664, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14867666267634752, + "kl": 0.029296875, + "learning_rate": 2.3115164774958702e-07, + "loss": 0.1024, + "num_tokens": 1998193442.0, + "reward": 2.3761162757873535, + "reward_std": 0.41546785831451416, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15155641734600067, + "step": 3641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 989.0960083007812, + "completions/mean_terminated_length": 789.6737060546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7760907783282723, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12126398754873464, + "kl": 0.029205322265625, + "learning_rate": 2.3091550547776023e-07, + "loss": 0.0332, + "num_tokens": 1998705789.0, + "reward": 2.5396206378936768, + "reward_std": 0.41325685381889343, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12130890786647797, + "step": 3642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 879.3482666015625, + "completions/mean_terminated_length": 739.1099853515625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7763038729955782, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1348613642027373, + "kl": 0.0330810546875, + "learning_rate": 2.3067953979477747e-07, + "loss": 0.0594, + "num_tokens": 1999164969.0, + "reward": 2.4955358505249023, + "reward_std": 0.3621552586555481, + "rewards/accuracy_reward/mean": 0.6157407164573669, + "rewards/accuracy_reward/std": 0.48698362708091736, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 3643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 941.1607666015625, + "completions/mean_terminated_length": 776.5538940429688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7765169676628842, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1290798824179389, + "kl": 0.029510498046875, + "learning_rate": 2.304437508312275e-07, + "loss": 0.0633, + "num_tokens": 1999654449.0, + "reward": 2.4095983505249023, + "reward_std": 0.41413557529449463, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1253739446401596, + "step": 3644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1144.859375, + "completions/mean_terminated_length": 871.8168334960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7767300623301902, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12082092667679627, + "kl": 0.024749755859375, + "learning_rate": 2.3020813871760157e-07, + "loss": 0.0711, + "num_tokens": 2000238882.0, + "reward": 2.368861675262451, + "reward_std": 0.4545716345310211, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1590748131275177, + "step": 3645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 954.8058471679688, + "completions/mean_terminated_length": 765.9293212890625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7769431569974962, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12355079622083395, + "kl": 0.030517578125, + "learning_rate": 2.2997270358429283e-07, + "loss": 0.0449, + "num_tokens": 2000738667.0, + "reward": 2.5200893878936768, + "reward_std": 0.4130350649356842, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10289046913385391, + "step": 3646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 981.4152221679688, + "completions/mean_terminated_length": 753.0677490234375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7771562516648021, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1374693830646525, + "kl": 0.028717041015625, + "learning_rate": 2.2973744556159668e-07, + "loss": 0.1158, + "num_tokens": 2001240261.0, + "reward": 2.462611675262451, + "reward_std": 0.42233961820602417, + "rewards/accuracy_reward/mean": 0.5995370149612427, + "rewards/accuracy_reward/std": 0.4905603229999542, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.141964390873909, + "step": 3647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 971.4085083007812, + "completions/mean_terminated_length": 754.9356689453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7773693463321081, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14054032119794602, + "kl": 0.028350830078125, + "learning_rate": 2.2950236477971035e-07, + "loss": 0.0882, + "num_tokens": 2001748428.0, + "reward": 2.4654018878936768, + "reward_std": 0.42282602190971375, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14929908514022827, + "step": 3648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1942.0, + "completions/mean_length": 959.7277221679688, + "completions/mean_terminated_length": 758.1957397460938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.777582440999414, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12800211027742053, + "kl": 0.029632568359375, + "learning_rate": 2.2926746136873305e-07, + "loss": 0.0499, + "num_tokens": 2002244354.0, + "reward": 2.4916296005249023, + "reward_std": 0.3784066140651703, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12292414903640747, + "step": 3649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 966.4553833007812, + "completions/mean_terminated_length": 779.5916137695312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.77779553566672, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11776831389861644, + "kl": 0.02777099609375, + "learning_rate": 2.29032735458666e-07, + "loss": 0.0313, + "num_tokens": 2002743198.0, + "reward": 2.486049175262451, + "reward_std": 0.3369090259075165, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11214316636323929, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1019.825927734375, + "completions/mean_terminated_length": 806.4312744140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7780086303340259, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.2513765332487112, + "kl": 0.039306640625, + "learning_rate": 2.2879818717941193e-07, + "loss": 0.0801, + "num_tokens": 2003268400.0, + "reward": 2.4268975257873535, + "reward_std": 0.3885403573513031, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1273927539587021, + "step": 3651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1058.8348388671875, + "completions/mean_terminated_length": 863.11767578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7782217250013318, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13243359433625523, + "kl": 0.026763916015625, + "learning_rate": 2.2856381666077533e-07, + "loss": 0.0771, + "num_tokens": 2003810902.0, + "reward": 2.5066964626312256, + "reward_std": 0.4575176537036896, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13422717154026031, + "step": 3652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 987.4397583007812, + "completions/mean_terminated_length": 804.2015991210938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.7784348196686378, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14586084636403504, + "kl": 0.02801513671875, + "learning_rate": 2.283296240324624e-07, + "loss": 0.0757, + "num_tokens": 2004323131.0, + "reward": 2.4308037757873535, + "reward_std": 0.42310458421707153, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.1480942964553833, + "step": 3653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1115.607177734375, + "completions/mean_terminated_length": 837.2406005859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7786479143359437, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10713449031580556, + "kl": 0.02374267578125, + "learning_rate": 2.2809560942408064e-07, + "loss": 0.1111, + "num_tokens": 2004895835.0, + "reward": 2.3677456378936768, + "reward_std": 0.39975327253341675, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11660738289356232, + "step": 3654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1134.997802734375, + "completions/mean_terminated_length": 862.4203491210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7788610090032497, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1320323343113309, + "kl": 0.025543212890625, + "learning_rate": 2.2786177296513967e-07, + "loss": 0.0904, + "num_tokens": 2005473098.0, + "reward": 2.365513563156128, + "reward_std": 0.4179643988609314, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11898136883974075, + "step": 3655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 975.55810546875, + "completions/mean_terminated_length": 766.789306640625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7790741036705556, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1280844386746548, + "kl": 0.028228759765625, + "learning_rate": 2.276281147850495e-07, + "loss": 0.0305, + "num_tokens": 2005974356.0, + "reward": 2.4776787757873535, + "reward_std": 0.35402682423591614, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09137456864118576, + "step": 3656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 974.7120971679688, + "completions/mean_terminated_length": 758.9035034179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7792871983378616, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14623424390506726, + "kl": 0.030487060546875, + "learning_rate": 2.2739463501312245e-07, + "loss": 0.0807, + "num_tokens": 2006482035.0, + "reward": 2.5479912757873535, + "reward_std": 0.4567717909812927, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.4769369065761566, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13455694913864136, + "step": 3657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 897.5982666015625, + "completions/mean_terminated_length": 655.0811157226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7795002930051675, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14366237493308276, + "kl": 0.031890869140625, + "learning_rate": 2.271613337785716e-07, + "loss": 0.0686, + "num_tokens": 2006948911.0, + "reward": 2.5206475257873535, + "reward_std": 0.375855028629303, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12517839670181274, + "step": 3658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1167.1451416015625, + "completions/mean_terminated_length": 883.9203491210938, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7797133876724734, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12655777725008752, + "kl": 0.0240478515625, + "learning_rate": 2.2692821121051133e-07, + "loss": 0.0763, + "num_tokens": 2007538256.0, + "reward": 2.3878350257873535, + "reward_std": 0.46959805488586426, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.16374634206295013, + "step": 3659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1101.3773193359375, + "completions/mean_terminated_length": 839.77490234375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7799264823397795, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12352040421287824, + "kl": 0.026763916015625, + "learning_rate": 2.266952674379571e-07, + "loss": 0.0815, + "num_tokens": 2008098857.0, + "reward": 2.3722100257873535, + "reward_std": 0.4799840748310089, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14461299777030945, + "step": 3660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1060.430908203125, + "completions/mean_terminated_length": 812.1591796875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7801395770070854, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1641267105861403, + "kl": 0.030364990234375, + "learning_rate": 2.2646250258982536e-07, + "loss": 0.0677, + "num_tokens": 2008650250.0, + "reward": 2.361607313156128, + "reward_std": 0.49516233801841736, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.15883232653141022, + "step": 3661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1031.640625, + "completions/mean_terminated_length": 820.6981201171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7803526716743914, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13777478562514822, + "kl": 0.02581787109375, + "learning_rate": 2.2622991679493388e-07, + "loss": 0.0633, + "num_tokens": 2009180633.0, + "reward": 2.3203125, + "reward_std": 0.4260157346725464, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1569942682981491, + "step": 3662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 872.997802734375, + "completions/mean_terminated_length": 731.9974975585938, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.7805657663416973, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12872747447833294, + "kl": 0.031158447265625, + "learning_rate": 2.2599751018200094e-07, + "loss": 0.0115, + "num_tokens": 2009638200.0, + "reward": 2.5262277126312256, + "reward_std": 0.3268493413925171, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.13086353242397308, + "step": 3663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1036.4107666015625, + "completions/mean_terminated_length": 789.1333618164062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7807788610090033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1448101448321292, + "kl": 0.028839111328125, + "learning_rate": 2.257652828796459e-07, + "loss": 0.0808, + "num_tokens": 2010177504.0, + "reward": 2.3253350257873535, + "reward_std": 0.48469600081443787, + "rewards/accuracy_reward/mean": 0.4583333432674408, + "rewards/accuracy_reward/std": 0.4988385736942291, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.1508246213197708, + "step": 3664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1051.0223388671875, + "completions/mean_terminated_length": 827.6557006835938, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7809919556763092, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11972620554431765, + "kl": 0.02435302734375, + "learning_rate": 2.2553323501638865e-07, + "loss": 0.0501, + "num_tokens": 2010724442.0, + "reward": 2.3950893878936768, + "reward_std": 0.41521820425987244, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14626215398311615, + "step": 3665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1126.1004638671875, + "completions/mean_terminated_length": 861.186767578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7812050503436152, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12409003575996988, + "kl": 0.024627685546875, + "learning_rate": 2.2530136672065004e-07, + "loss": 0.0778, + "num_tokens": 2011294039.0, + "reward": 2.349888563156128, + "reward_std": 0.4730124771595001, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14515583217144012, + "step": 3666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1002.33935546875, + "completions/mean_terminated_length": 739.4636840820312, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7814181450109211, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12166091373410194, + "kl": 0.027130126953125, + "learning_rate": 2.2506967812075142e-07, + "loss": 0.0724, + "num_tokens": 2011811135.0, + "reward": 2.453125, + "reward_std": 0.4462078809738159, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.17205262184143066, + "step": 3667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1043.0648193359375, + "completions/mean_terminated_length": 797.4138793945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.781631239678227, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11743546623208966, + "kl": 0.0279541015625, + "learning_rate": 2.2483816934491449e-07, + "loss": 0.0568, + "num_tokens": 2012343676.0, + "reward": 2.407924175262451, + "reward_std": 0.4028477072715759, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13776934146881104, + "step": 3668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1113.84375, + "completions/mean_terminated_length": 862.44189453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.781844334345533, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11042146865188092, + "kl": 0.024444580078125, + "learning_rate": 2.2460684052126195e-07, + "loss": 0.0655, + "num_tokens": 2012921238.0, + "reward": 2.3565850257873535, + "reward_std": 0.4532468914985657, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1422542929649353, + "step": 3669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1136.0960693359375, + "completions/mean_terminated_length": 884.0883178710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7820574290128389, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11780196591125137, + "kl": 0.0245361328125, + "learning_rate": 2.2437569177781663e-07, + "loss": 0.0575, + "num_tokens": 2013504385.0, + "reward": 2.3046875, + "reward_std": 0.3964941203594208, + "rewards/accuracy_reward/mean": 0.4166666567325592, + "rewards/accuracy_reward/std": 0.4935782551765442, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14164415001869202, + "step": 3670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1031.7232666015625, + "completions/mean_terminated_length": 765.4873046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7822705236801449, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1974781473347365, + "kl": 0.043060302734375, + "learning_rate": 2.2414472324250117e-07, + "loss": 0.0773, + "num_tokens": 2014035461.0, + "reward": 2.3470983505249023, + "reward_std": 0.4678664803504944, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.1816219538450241, + "step": 3671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1020.87060546875, + "completions/mean_terminated_length": 748.1299438476562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7824836183474508, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12375618884014736, + "kl": 0.02838134765625, + "learning_rate": 2.239139350431395e-07, + "loss": 0.0459, + "num_tokens": 2014569435.0, + "reward": 2.353794813156128, + "reward_std": 0.35637223720550537, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.10537806153297424, + "step": 3672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1010.3951416015625, + "completions/mean_terminated_length": 784.8287963867188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7826967130147569, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12761523859488438, + "kl": 0.027923583984375, + "learning_rate": 2.2368332730745483e-07, + "loss": 0.082, + "num_tokens": 2015089836.0, + "reward": 2.458705425262451, + "reward_std": 0.40176674723625183, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266100406646729, + "step": 3673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 920.794677734375, + "completions/mean_terminated_length": 782.3659057617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7829098076820628, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13777751915720313, + "kl": 0.0279541015625, + "learning_rate": 2.2345290016307138e-07, + "loss": 0.0507, + "num_tokens": 2015572720.0, + "reward": 2.482142925262451, + "reward_std": 0.39352041482925415, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.1299937218427658, + "step": 3674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 954.3192138671875, + "completions/mean_terminated_length": 734.4102172851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7831229023493687, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13990265677542843, + "kl": 0.029571533203125, + "learning_rate": 2.2322265373751238e-07, + "loss": 0.1061, + "num_tokens": 2016064287.0, + "reward": 2.3995537757873535, + "reward_std": 0.48027709126472473, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.15883232653141022, + "step": 3675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1058.0067138671875, + "completions/mean_terminated_length": 832.8849487304688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7833359970166747, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13810124635676385, + "kl": 0.02728271484375, + "learning_rate": 2.22992588158202e-07, + "loss": 0.0608, + "num_tokens": 2016604258.0, + "reward": 2.239955425262451, + "reward_std": 0.4449780583381653, + "rewards/accuracy_reward/mean": 0.3727678656578064, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1591111123561859, + "step": 3676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1066.203125, + "completions/mean_terminated_length": 809.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7835490916839806, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1219721677944319, + "kl": 0.025390625, + "learning_rate": 2.2276270355246374e-07, + "loss": 0.0726, + "num_tokens": 2017158909.0, + "reward": 2.37109375, + "reward_std": 0.4210568964481354, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15941192209720612, + "step": 3677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1013.9910888671875, + "completions/mean_terminated_length": 812.7039794921875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7837621863512866, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11583766678287374, + "kl": 0.026336669921875, + "learning_rate": 2.2253300004752125e-07, + "loss": 0.0695, + "num_tokens": 2017682521.0, + "reward": 2.4369421005249023, + "reward_std": 0.4287792146205902, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12540756165981293, + "step": 3678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 981.5870971679688, + "completions/mean_terminated_length": 800.6031494140625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7839752810185925, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1418503990547118, + "kl": 0.02716064453125, + "learning_rate": 2.2230347777049768e-07, + "loss": 0.087, + "num_tokens": 2018192496.0, + "reward": 2.3833706378936768, + "reward_std": 0.5248706936836243, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067809045314789, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.15765586495399475, + "step": 3679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 962.0313110351562, + "completions/mean_terminated_length": 777.7284545898438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7841883756858985, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1333875099276704, + "kl": 0.0286865234375, + "learning_rate": 2.2207413684841607e-07, + "loss": 0.0776, + "num_tokens": 2018691390.0, + "reward": 2.30078125, + "reward_std": 0.4338666498661041, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14321638643741608, + "step": 3680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 902.8214721679688, + "completions/mean_terminated_length": 762.1854858398438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7844014703532044, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14520749362410013, + "kl": 0.030120849609375, + "learning_rate": 2.218449774081994e-07, + "loss": 0.084, + "num_tokens": 2019160462.0, + "reward": 2.549107313156128, + "reward_std": 0.5027464032173157, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166749835014343, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15456202626228333, + "step": 3681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 899.82373046875, + "completions/mean_terminated_length": 683.5888671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7846145650205104, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12764753318983862, + "kl": 0.028656005859375, + "learning_rate": 2.2161599957666944e-07, + "loss": 0.0437, + "num_tokens": 2019631167.0, + "reward": 2.4246652126312256, + "reward_std": 0.3342822790145874, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10500273108482361, + "step": 3682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1065.279052734375, + "completions/mean_terminated_length": 835.165283203125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7848276596878163, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1135853342225109, + "kl": 0.025726318359375, + "learning_rate": 2.2138720348054796e-07, + "loss": 0.0671, + "num_tokens": 2020176108.0, + "reward": 2.263951063156128, + "reward_std": 0.4311540722846985, + "rewards/accuracy_reward/mean": 0.3772321343421936, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14869897067546844, + "step": 3683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 994.904052734375, + "completions/mean_terminated_length": 786.5374755859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7850407543551222, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12697064415422812, + "kl": 0.026123046875, + "learning_rate": 2.2115858924645635e-07, + "loss": 0.0537, + "num_tokens": 2020703201.0, + "reward": 2.5011162757873535, + "reward_std": 0.39964696764945984, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.10269007831811905, + "step": 3684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 991.7745971679688, + "completions/mean_terminated_length": 751.591796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7852538490224282, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13340527552487247, + "kl": 0.029144287109375, + "learning_rate": 2.209301570009149e-07, + "loss": 0.0945, + "num_tokens": 2021212524.0, + "reward": 2.48046875, + "reward_std": 0.4135218560695648, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.12909629940986633, + "step": 3685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 870.6741333007812, + "completions/mean_terminated_length": 729.3949584960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7854669436897341, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12423435706545113, + "kl": 0.0306396484375, + "learning_rate": 2.2070190687034334e-07, + "loss": 0.0459, + "num_tokens": 2021675066.0, + "reward": 2.6099331378936768, + "reward_std": 0.34556421637535095, + "rewards/accuracy_reward/mean": 0.6785714030265808, + "rewards/accuracy_reward/std": 0.4675469994544983, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10724954307079315, + "step": 3686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 944.9442138671875, + "completions/mean_terminated_length": 777.6427001953125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.7856800383570401, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13382084548346312, + "kl": 0.02935791015625, + "learning_rate": 2.2047383898106065e-07, + "loss": 0.0479, + "num_tokens": 2022164705.0, + "reward": 2.4888393878936768, + "reward_std": 0.4068002998828888, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855977296829224, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09624522179365158, + "step": 3687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1041.22998046875, + "completions/mean_terminated_length": 812.2931518554688, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.785893133024346, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1119449561398047, + "kl": 0.027008056640625, + "learning_rate": 2.202459534592851e-07, + "loss": 0.0281, + "num_tokens": 2022703384.0, + "reward": 2.4213171005249023, + "reward_std": 0.4229731857776642, + "rewards/accuracy_reward/mean": 0.5208333134651184, + "rewards/accuracy_reward/std": 0.5001450181007385, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10481233894824982, + "step": 3688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 967.591552734375, + "completions/mean_terminated_length": 757.27197265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7861062276916521, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1470578422340937, + "kl": 0.033721923828125, + "learning_rate": 2.2001825043113393e-07, + "loss": 0.0995, + "num_tokens": 2023204177.0, + "reward": 2.4129464626312256, + "reward_std": 0.42100703716278076, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13318143784999847, + "step": 3689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 1068.7366943359375, + "completions/mean_terminated_length": 787.3390502929688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.786319322358958, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1196608706978701, + "kl": 0.024627685546875, + "learning_rate": 2.1979073002262294e-07, + "loss": 0.0837, + "num_tokens": 2023755707.0, + "reward": 2.4486608505249023, + "reward_std": 0.3850461542606354, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.19128035008907318, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09730303287506104, + "step": 3690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 958.3035888671875, + "completions/mean_terminated_length": 773.3681640625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7865324170262639, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12130150411124722, + "kl": 0.02642822265625, + "learning_rate": 2.1956339235966764e-07, + "loss": 0.073, + "num_tokens": 2024252691.0, + "reward": 2.5005581378936768, + "reward_std": 0.32296568155288696, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12292414158582687, + "step": 3691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 916.0469360351562, + "completions/mean_terminated_length": 751.0307006835938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7867455116935699, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11505428708261616, + "kl": 0.03076171875, + "learning_rate": 2.1933623756808193e-07, + "loss": 0.0301, + "num_tokens": 2024729992.0, + "reward": 2.4464287757873535, + "reward_std": 0.3755577802658081, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.08382127434015274, + "step": 3692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 903.872802734375, + "completions/mean_terminated_length": 695.5752563476562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7869586063608758, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13296898931208118, + "kl": 0.033599853515625, + "learning_rate": 2.1910926577357858e-07, + "loss": 0.0373, + "num_tokens": 2025204735.0, + "reward": 2.322544813156128, + "reward_std": 0.42915159463882446, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1292569637298584, + "step": 3693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1077.5223388671875, + "completions/mean_terminated_length": 826.7247314453125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7871717010281818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14054347357368147, + "kl": 0.02691650390625, + "learning_rate": 2.1888247710176905e-07, + "loss": 0.0776, + "num_tokens": 2025761785.0, + "reward": 2.3699777126312256, + "reward_std": 0.5200629234313965, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.8816964030265808, + "rewards/format_reward/std": 0.32332828640937805, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15792487561702728, + "step": 3694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 883.841552734375, + "completions/mean_terminated_length": 753.8486328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7873847956954877, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13327169568288547, + "kl": 0.029388427734375, + "learning_rate": 2.1865587167816346e-07, + "loss": 0.0492, + "num_tokens": 2026231090.0, + "reward": 2.333705425262451, + "reward_std": 0.3993498980998993, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.8816964030265808, + "rewards/format_reward/std": 0.32332828640937805, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.15497742593288422, + "step": 3695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 957.79248046875, + "completions/mean_terminated_length": 795.6589965820312, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7875978903627937, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13060439677947236, + "kl": 0.027679443359375, + "learning_rate": 2.1842944962817077e-07, + "loss": 0.0829, + "num_tokens": 2026728981.0, + "reward": 2.5736608505249023, + "reward_std": 0.43459370732307434, + "rewards/accuracy_reward/mean": 0.6741071343421936, + "rewards/accuracy_reward/std": 0.46923142671585083, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12383606284856796, + "step": 3696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1086.435302734375, + "completions/mean_terminated_length": 851.3861083984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7878109850300996, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11605514449486162, + "kl": 0.0260009765625, + "learning_rate": 2.1820321107709817e-07, + "loss": 0.0571, + "num_tokens": 2027290696.0, + "reward": 2.3130581378936768, + "reward_std": 0.43512123823165894, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12517839670181274, + "step": 3697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 896.325927734375, + "completions/mean_terminated_length": 686.6543579101562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7880240796974056, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14635062310934602, + "kl": 0.031280517578125, + "learning_rate": 2.179771561501514e-07, + "loss": 0.0616, + "num_tokens": 2027767098.0, + "reward": 2.490513563156128, + "reward_std": 0.42108434438705444, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13153910636901855, + "step": 3698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 923.1785888671875, + "completions/mean_terminated_length": 742.5077514648438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7882371743647115, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12043100363810931, + "kl": 0.029083251953125, + "learning_rate": 2.1775128497243445e-07, + "loss": 0.0485, + "num_tokens": 2028247722.0, + "reward": 2.4720983505249023, + "reward_std": 0.4003392457962036, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13083133101463318, + "step": 3699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1034.546875, + "completions/mean_terminated_length": 797.2369384765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7884502690320174, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1310006120665886, + "kl": 0.02679443359375, + "learning_rate": 2.1752559766894978e-07, + "loss": 0.0751, + "num_tokens": 2028784079.0, + "reward": 2.310267925262451, + "reward_std": 0.4275436997413635, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14383503794670105, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1150.6629638671875, + "completions/mean_terminated_length": 795.6417236328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7886633636993234, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13043937884250986, + "kl": 0.023773193359375, + "learning_rate": 2.1730009436459812e-07, + "loss": 0.1232, + "num_tokens": 2029366968.0, + "reward": 2.2566964626312256, + "reward_std": 0.5472320914268494, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.18130891025066376, + "step": 3701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1035.357177734375, + "completions/mean_terminated_length": 847.8306884765625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7888764583666293, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11786473367710062, + "kl": 0.0260009765625, + "learning_rate": 2.1707477518417806e-07, + "loss": 0.0888, + "num_tokens": 2029901864.0, + "reward": 2.4497768878936768, + "reward_std": 0.37727296352386475, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11015089601278305, + "step": 3702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1061.2210693359375, + "completions/mean_terminated_length": 887.6929321289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7890895530339354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13375056899691412, + "kl": 0.025421142578125, + "learning_rate": 2.1684964025238684e-07, + "loss": 0.0562, + "num_tokens": 2030447803.0, + "reward": 2.478236675262451, + "reward_std": 0.42049261927604675, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12338031083345413, + "step": 3703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 971.33935546875, + "completions/mean_terminated_length": 791.8958740234375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7893026477012413, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12858110366780875, + "kl": 0.02825927734375, + "learning_rate": 2.166246896938192e-07, + "loss": 0.0667, + "num_tokens": 2030950275.0, + "reward": 2.450892925262451, + "reward_std": 0.49948036670684814, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14187753200531006, + "step": 3704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 936.3348388671875, + "completions/mean_terminated_length": 737.4052734375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7895157423685473, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14633276212892932, + "kl": 0.033294677734375, + "learning_rate": 2.163999236329681e-07, + "loss": 0.088, + "num_tokens": 2031433977.0, + "reward": 2.5396206378936768, + "reward_std": 0.48697176575660706, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.15557540953159332, + "step": 3705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1003.810302734375, + "completions/mean_terminated_length": 803.8590087890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7897288370358532, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12174580562225036, + "kl": 0.025146484375, + "learning_rate": 2.1617534219422445e-07, + "loss": 0.0556, + "num_tokens": 2031958724.0, + "reward": 2.3900671005249023, + "reward_std": 0.35458117723464966, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11214316636323929, + "step": 3706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1006.4576416015625, + "completions/mean_terminated_length": 790.2883911132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7899419317031592, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12339277743912677, + "kl": 0.027801513671875, + "learning_rate": 2.159509455018766e-07, + "loss": 0.0387, + "num_tokens": 2032481793.0, + "reward": 2.4765625, + "reward_std": 0.48525917530059814, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12585100531578064, + "step": 3707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 985.9397583007812, + "completions/mean_terminated_length": 789.2619018554688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.7901550263704651, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.2479972637988037, + "kl": 0.02691650390625, + "learning_rate": 2.1572673368011141e-07, + "loss": 0.0374, + "num_tokens": 2032989526.0, + "reward": 2.490513563156128, + "reward_std": 0.41473469138145447, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11660738289356232, + "step": 3708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1100.2410888671875, + "completions/mean_terminated_length": 918.7553100585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.790368121037771, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11057306656234722, + "kl": 0.02630615234375, + "learning_rate": 2.1550270685301248e-07, + "loss": 0.034, + "num_tokens": 2033552466.0, + "reward": 2.4151787757873535, + "reward_std": 0.403978556394577, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11849904805421829, + "step": 3709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 915.1897583007812, + "completions/mean_terminated_length": 743.3753051757812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.790581215705077, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14854224376765482, + "kl": 0.0350341796875, + "learning_rate": 2.1527886514456178e-07, + "loss": 0.0571, + "num_tokens": 2034026679.0, + "reward": 2.5558037757873535, + "reward_std": 0.4550960063934326, + "rewards/accuracy_reward/mean": 0.6607142686843872, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13989263772964478, + "step": 3710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 949.05810546875, + "completions/mean_terminated_length": 762.5535278320312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7907943103723829, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1263606279364349, + "kl": 0.028778076171875, + "learning_rate": 2.150552086786385e-07, + "loss": 0.0869, + "num_tokens": 2034515489.0, + "reward": 2.4799108505249023, + "reward_std": 0.441692054271698, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14238695800304413, + "step": 3711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1065.9710693359375, + "completions/mean_terminated_length": 836.019287109375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7910074050396889, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12221103511043187, + "kl": 0.024169921875, + "learning_rate": 2.1483173757901947e-07, + "loss": 0.049, + "num_tokens": 2035062452.0, + "reward": 2.38671875, + "reward_std": 0.3715497851371765, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9849330186843872, + "rewards/tag_count_reward/std": 0.09848741441965103, + "step": 3712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1068.01123046875, + "completions/mean_terminated_length": 838.5372314453125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7912204997069948, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12515750330428307, + "kl": 0.024932861328125, + "learning_rate": 2.146084519693787e-07, + "loss": 0.0741, + "num_tokens": 2035608425.0, + "reward": 2.486049175262451, + "reward_std": 0.44873178005218506, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12086557596921921, + "step": 3713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 952.372802734375, + "completions/mean_terminated_length": 779.677001953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7914335943743008, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1408033120560987, + "kl": 0.02996826171875, + "learning_rate": 2.1438535197328755e-07, + "loss": 0.0732, + "num_tokens": 2036106720.0, + "reward": 2.4994421005249023, + "reward_std": 0.4345710873603821, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.12992529571056366, + "step": 3714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 938.497802734375, + "completions/mean_terminated_length": 760.2875366210938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7916466890416067, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1371792415767387, + "kl": 0.03082275390625, + "learning_rate": 2.1416243771421517e-07, + "loss": 0.1118, + "num_tokens": 2036589951.0, + "reward": 2.5691964626312256, + "reward_std": 0.47533902525901794, + "rewards/accuracy_reward/mean": 0.7008928656578064, + "rewards/accuracy_reward/std": 0.45837873220443726, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.1671055108308792, + "step": 3715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 985.5313110351562, + "completions/mean_terminated_length": 754.559814453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7918597837089126, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12873672556769974, + "kl": 0.02862548828125, + "learning_rate": 2.139397093155273e-07, + "loss": 0.0762, + "num_tokens": 2037100029.0, + "reward": 2.3582589626312256, + "reward_std": 0.411371111869812, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12914101779460907, + "step": 3716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1050.82373046875, + "completions/mean_terminated_length": 856.7066650390625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7920728783762186, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1333956924941047, + "kl": 0.024993896484375, + "learning_rate": 2.137171669004871e-07, + "loss": 0.1113, + "num_tokens": 2037642190.0, + "reward": 2.42578125, + "reward_std": 0.5111509561538696, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15880775451660156, + "step": 3717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1001.2232666015625, + "completions/mean_terminated_length": 813.9052734375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.7922859730435245, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11934848126811798, + "kl": 0.026397705078125, + "learning_rate": 2.1349481059225477e-07, + "loss": 0.0924, + "num_tokens": 2038160066.0, + "reward": 2.5753350257873535, + "reward_std": 0.4170945882797241, + "rewards/accuracy_reward/mean": 0.6944444179534912, + "rewards/accuracy_reward/std": 0.46117642521858215, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15448831021785736, + "step": 3718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1942.0, + "completions/mean_length": 943.8951416015625, + "completions/mean_terminated_length": 759.8776245117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7924990677108306, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11582637769836876, + "kl": 0.02703857421875, + "learning_rate": 2.1327264051388755e-07, + "loss": 0.0414, + "num_tokens": 2038651987.0, + "reward": 2.4972100257873535, + "reward_std": 0.3424782156944275, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.1125321164727211, + "step": 3719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 938.4531860351562, + "completions/mean_terminated_length": 786.3832397460938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7927121623781365, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14429382426113838, + "kl": 0.0302734375, + "learning_rate": 2.1305065678833948e-07, + "loss": 0.1026, + "num_tokens": 2039144254.0, + "reward": 2.498326063156128, + "reward_std": 0.519925057888031, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.17201544344425201, + "step": 3720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 950.4910888671875, + "completions/mean_terminated_length": 729.8123779296875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7929252570454425, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1485087061901842, + "kl": 0.029815673828125, + "learning_rate": 2.1282885953846146e-07, + "loss": 0.0699, + "num_tokens": 2039639386.0, + "reward": 2.5011162757873535, + "reward_std": 0.4704189598560333, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13033421337604523, + "step": 3721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1150.671875, + "completions/mean_terminated_length": 886.1416015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7931383517127484, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11780789927320942, + "kl": 0.024871826171875, + "learning_rate": 2.1260724888700154e-07, + "loss": 0.0422, + "num_tokens": 2040238471.0, + "reward": 2.318080425262451, + "reward_std": 0.43737876415252686, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1569942682981491, + "step": 3722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 932.40185546875, + "completions/mean_terminated_length": 779.5025024414062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7933514463800544, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13564200236173438, + "kl": 0.0302734375, + "learning_rate": 2.1238582495660437e-07, + "loss": 0.0868, + "num_tokens": 2040719307.0, + "reward": 2.505580425262451, + "reward_std": 0.44408172369003296, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13083133101463318, + "step": 3723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1100.0625, + "completions/mean_terminated_length": 868.344482421875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7935645410473603, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11664149734504703, + "kl": 0.025634765625, + "learning_rate": 2.1216458786981057e-07, + "loss": 0.0429, + "num_tokens": 2041283687.0, + "reward": 2.415736675262451, + "reward_std": 0.42259514331817627, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12450840324163437, + "step": 3724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1074.841552734375, + "completions/mean_terminated_length": 761.9380493164062, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.7937776357146662, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1391687292354972, + "kl": 0.027191162109375, + "learning_rate": 2.119435377490585e-07, + "loss": 0.0895, + "num_tokens": 2041836048.0, + "reward": 2.338169813156128, + "reward_std": 0.4713340103626251, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.17736539244651794, + "step": 3725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 927.2879638671875, + "completions/mean_terminated_length": 786.4949951171875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.7939907303819722, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12375926531322826, + "kl": 0.028839111328125, + "learning_rate": 2.117226747166821e-07, + "loss": 0.0934, + "num_tokens": 2042312721.0, + "reward": 2.6339287757873535, + "reward_std": 0.374872624874115, + "rewards/accuracy_reward/mean": 0.7165178656578064, + "rewards/accuracy_reward/std": 0.4511922299861908, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11731318384408951, + "step": 3726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 955.4866333007812, + "completions/mean_terminated_length": 773.4010620117188, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7942038250492781, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12880166313406163, + "kl": 0.027557373046875, + "learning_rate": 2.115019988949126e-07, + "loss": 0.0892, + "num_tokens": 2042814059.0, + "reward": 2.4737725257873535, + "reward_std": 0.48032593727111816, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1400342434644699, + "step": 3727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1129.1429443359375, + "completions/mean_terminated_length": 851.3488159179688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7944169197165841, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1245878336278141, + "kl": 0.02484130859375, + "learning_rate": 2.1128151040587656e-07, + "loss": 0.0961, + "num_tokens": 2043390811.0, + "reward": 2.2818081378936768, + "reward_std": 0.4619499146938324, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14585080742835999, + "step": 3728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1053.5379638671875, + "completions/mean_terminated_length": 785.906494140625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.79463001438389, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12094039107319492, + "kl": 0.02728271484375, + "learning_rate": 2.1106120937159802e-07, + "loss": 0.0882, + "num_tokens": 2043930764.0, + "reward": 2.4034600257873535, + "reward_std": 0.41054195165634155, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.11921197921037674, + "step": 3729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1106.4375, + "completions/mean_terminated_length": 885.96142578125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.794843109051196, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1215029299627093, + "kl": 0.0257568359375, + "learning_rate": 2.1084109591399657e-07, + "loss": 0.0593, + "num_tokens": 2044493664.0, + "reward": 2.4676339626312256, + "reward_std": 0.5396391749382019, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822286784648895, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15520283579826355, + "step": 3730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1151.60498046875, + "completions/mean_terminated_length": 890.6945190429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7950562037185019, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11133902821550544, + "kl": 0.0233154296875, + "learning_rate": 2.1062117015488807e-07, + "loss": 0.0571, + "num_tokens": 2045081695.0, + "reward": 2.345982313156128, + "reward_std": 0.4985843002796173, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.2979368567466736, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1540280133485794, + "step": 3731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1099.779052734375, + "completions/mean_terminated_length": 841.17333984375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.7952692983858078, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12075919170782645, + "kl": 0.0235595703125, + "learning_rate": 2.104014322159847e-07, + "loss": 0.082, + "num_tokens": 2045641052.0, + "reward": 2.2840402126312256, + "reward_std": 0.426430881023407, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15371057391166687, + "step": 3732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1097.732177734375, + "completions/mean_terminated_length": 852.1572875976562, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.7954823930531139, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12949899826555375, + "kl": 0.024505615234375, + "learning_rate": 2.1018188221889437e-07, + "loss": 0.0997, + "num_tokens": 2046207460.0, + "reward": 2.4609375, + "reward_std": 0.4859558641910553, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15964375436306, + "step": 3733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 974.10498046875, + "completions/mean_terminated_length": 761.6229858398438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7956954877204198, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15092354572097386, + "kl": 0.028076171875, + "learning_rate": 2.0996252028512163e-07, + "loss": 0.1397, + "num_tokens": 2046712707.0, + "reward": 2.4447546005249023, + "reward_std": 0.47263821959495544, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14963631331920624, + "step": 3734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 919.8638916015625, + "completions/mean_terminated_length": 752.8590087890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7959085823877258, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1488288799189458, + "kl": 0.0284423828125, + "learning_rate": 2.0974334653606617e-07, + "loss": 0.0818, + "num_tokens": 2047199990.0, + "reward": 2.4893975257873535, + "reward_std": 0.4314475953578949, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11517468839883804, + "step": 3735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 937.6607666015625, + "completions/mean_terminated_length": 752.6041870117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7961216770550317, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13175590074076265, + "kl": 0.029266357421875, + "learning_rate": 2.095243610930238e-07, + "loss": 0.1074, + "num_tokens": 2047690414.0, + "reward": 2.4291296005249023, + "reward_std": 0.42258235812187195, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13850137591362, + "step": 3736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 903.51123046875, + "completions/mean_terminated_length": 723.1137084960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7963347717223377, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13573423016954303, + "kl": 0.0311279296875, + "learning_rate": 2.0930556407718658e-07, + "loss": 0.0614, + "num_tokens": 2048154563.0, + "reward": 2.3934152126312256, + "reward_std": 0.4178394079208374, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.16459043323993683, + "step": 3737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 939.9576416015625, + "completions/mean_terminated_length": 758.6415405273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7965478663896436, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13468923267631736, + "kl": 0.030731201171875, + "learning_rate": 2.090869556096417e-07, + "loss": 0.0627, + "num_tokens": 2048641936.0, + "reward": 2.4285714626312256, + "reward_std": 0.419168621301651, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10152244567871094, + "step": 3738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 923.0469360351562, + "completions/mean_terminated_length": 762.3392944335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7967609610569496, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.18708771396666835, + "kl": 0.03289794921875, + "learning_rate": 2.0886853581137214e-07, + "loss": 0.071, + "num_tokens": 2049122517.0, + "reward": 2.388392925262451, + "reward_std": 0.4342552125453949, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14040927588939667, + "step": 3739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1201.9576416015625, + "completions/mean_terminated_length": 902.9033203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7969740557242555, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12833860171497077, + "kl": 0.02337646484375, + "learning_rate": 2.0865030480325653e-07, + "loss": 0.08, + "num_tokens": 2049737938.0, + "reward": 2.244419813156128, + "reward_std": 0.49510660767555237, + "rewards/accuracy_reward/mean": 0.3683035671710968, + "rewards/accuracy_reward/std": 0.4828835725784302, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.17233900725841522, + "step": 3740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1019.1160888671875, + "completions/mean_terminated_length": 760.4580688476562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7971871503915614, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11926554649229412, + "kl": 0.026336669921875, + "learning_rate": 2.084322627060693e-07, + "loss": 0.0597, + "num_tokens": 2050260918.0, + "reward": 2.361607313156128, + "reward_std": 0.37909311056137085, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10947446525096893, + "step": 3741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 997.5000610351562, + "completions/mean_terminated_length": 714.7875366210938, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7974002450588674, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 1.8659877269190053, + "kl": 0.11663818359375, + "learning_rate": 2.0821440964047993e-07, + "loss": 0.0736, + "num_tokens": 2050792182.0, + "reward": 2.3392858505249023, + "reward_std": 0.4276411235332489, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12891364097595215, + "step": 3742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1061.634033203125, + "completions/mean_terminated_length": 785.451416015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7976133397261733, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12145760039104567, + "kl": 0.025238037109375, + "learning_rate": 2.0799674572705317e-07, + "loss": 0.0867, + "num_tokens": 2051344898.0, + "reward": 2.3208706378936768, + "reward_std": 0.3850502371788025, + "rewards/accuracy_reward/mean": 0.4399038553237915, + "rewards/accuracy_reward/std": 0.4969730079174042, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12803789973258972, + "step": 3743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1027.8125, + "completions/mean_terminated_length": 767.7647094726562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7978264343934793, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1316396287913759, + "kl": 0.027587890625, + "learning_rate": 2.0777927108624966e-07, + "loss": 0.073, + "num_tokens": 2051871374.0, + "reward": 2.3816964626312256, + "reward_std": 0.4216080904006958, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11531686037778854, + "step": 3744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1151.7545166015625, + "completions/mean_terminated_length": 867.064697265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7980395290607852, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10326426520344567, + "kl": 0.022308349609375, + "learning_rate": 2.0756198583842478e-07, + "loss": 0.0446, + "num_tokens": 2052459264.0, + "reward": 2.3136162757873535, + "reward_std": 0.3580675423145294, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09605696052312851, + "step": 3745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1020.2589721679688, + "completions/mean_terminated_length": 816.9091186523438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7982526237280912, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14498698957856446, + "kl": 0.026092529296875, + "learning_rate": 2.0734489010382938e-07, + "loss": 0.0663, + "num_tokens": 2052986660.0, + "reward": 2.334263563156128, + "reward_std": 0.4096223711967468, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16142748296260834, + "step": 3746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1150.78125, + "completions/mean_terminated_length": 833.637451171875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7984657183953972, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13706398143237442, + "kl": 0.023193359375, + "learning_rate": 2.0712798400260933e-07, + "loss": 0.086, + "num_tokens": 2053570770.0, + "reward": 2.299107313156128, + "reward_std": 0.4727000594139099, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12891364097595215, + "step": 3747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 944.5201416015625, + "completions/mean_terminated_length": 693.591796875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.798678813062703, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1473395238650571, + "kl": 0.029083251953125, + "learning_rate": 2.069112676548053e-07, + "loss": 0.1053, + "num_tokens": 2054066987.0, + "reward": 2.4559152126312256, + "reward_std": 0.45777302980422974, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14869897067546844, + "step": 3748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 862.0379638671875, + "completions/mean_terminated_length": 732.873779296875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.7988919077300091, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15094615359444893, + "kl": 0.032379150390625, + "learning_rate": 2.0669474118035362e-07, + "loss": 0.1065, + "num_tokens": 2054519020.0, + "reward": 2.5814733505249023, + "reward_std": 0.39167726039886475, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166749835014343, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11828288435935974, + "step": 3749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 905.6183471679688, + "completions/mean_terminated_length": 781.2005004882812, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.799105002397315, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12552220695252464, + "kl": 0.028900146484375, + "learning_rate": 2.064784046990849e-07, + "loss": 0.0725, + "num_tokens": 2054987409.0, + "reward": 2.5200893878936768, + "reward_std": 0.4024791121482849, + "rewards/accuracy_reward/mean": 0.6087962985038757, + "rewards/accuracy_reward/std": 0.4885856807231903, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11409792304039001, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 979.8147583007812, + "completions/mean_terminated_length": 778.64453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.799318097064621, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13602199327134673, + "kl": 0.027252197265625, + "learning_rate": 2.0626225833072487e-07, + "loss": 0.116, + "num_tokens": 2055498014.0, + "reward": 2.3482143878936768, + "reward_std": 0.4339216947555542, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.49634626507759094, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12270178645849228, + "step": 3751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 909.357177734375, + "completions/mean_terminated_length": 779.0646362304688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7995311917319269, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12998688527229676, + "kl": 0.029022216796875, + "learning_rate": 2.0604630219489379e-07, + "loss": 0.0567, + "num_tokens": 2055974526.0, + "reward": 2.5083706378936768, + "reward_std": 0.32721710205078125, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.09830980002880096, + "step": 3752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 902.3170166015625, + "completions/mean_terminated_length": 735.2992553710938, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.7997442863992329, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13104773340760534, + "kl": 0.02886962890625, + "learning_rate": 2.0583053641110736e-07, + "loss": 0.1113, + "num_tokens": 2056448396.0, + "reward": 2.4966518878936768, + "reward_std": 0.4584699869155884, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1464626044034958, + "step": 3753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 963.2366333007812, + "completions/mean_terminated_length": 745.1206665039062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7999573810665388, + "frac_reward_zero_std": 0.3214285969734192, + "grad_norm": 0.11731660996010555, + "kl": 0.029205322265625, + "learning_rate": 2.0561496109877492e-07, + "loss": 0.0644, + "num_tokens": 2056943782.0, + "reward": 2.46484375, + "reward_std": 0.279685914516449, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.19128035008907318, + "rewards/tag_count_reward/mean": 0.9871651530265808, + "rewards/tag_count_reward/std": 0.08991296589374542, + "step": 3754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 859.9710083007812, + "completions/mean_terminated_length": 693.7073974609375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8001704757338448, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14311185883933064, + "kl": 0.032501220703125, + "learning_rate": 2.0539957637720095e-07, + "loss": 0.0686, + "num_tokens": 2057395673.0, + "reward": 2.532924175262451, + "reward_std": 0.42804330587387085, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.11614610999822617, + "step": 3755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1106.4285888671875, + "completions/mean_terminated_length": 788.8239135742188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8003835704011507, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12064399780381382, + "kl": 0.0279541015625, + "learning_rate": 2.0518438236558466e-07, + "loss": 0.0731, + "num_tokens": 2057966681.0, + "reward": 2.3660714626312256, + "reward_std": 0.44057387113571167, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.15527118742465973, + "step": 3756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 997.5647583007812, + "completions/mean_terminated_length": 744.4127197265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8005966650684566, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11527335610857593, + "kl": 0.0279541015625, + "learning_rate": 2.049693791830194e-07, + "loss": 0.0543, + "num_tokens": 2058477830.0, + "reward": 2.4520089626312256, + "reward_std": 0.4004580080509186, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13351379334926605, + "step": 3757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1049.024658203125, + "completions/mean_terminated_length": 769.3114013671875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.8008097597357626, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12610399470655284, + "kl": 0.02801513671875, + "learning_rate": 2.047545669484929e-07, + "loss": 0.0915, + "num_tokens": 2059018769.0, + "reward": 2.4095983505249023, + "reward_std": 0.42340442538261414, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9497767686843872, + "rewards/tag_count_reward/std": 0.18543121218681335, + "step": 3758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 981.9063110351562, + "completions/mean_terminated_length": 735.8846435546875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.8010228544030685, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1179028770441931, + "kl": 0.02679443359375, + "learning_rate": 2.045399457808873e-07, + "loss": 0.0703, + "num_tokens": 2059528023.0, + "reward": 2.3705358505249023, + "reward_std": 0.3360064923763275, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11967317014932632, + "step": 3759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1050.5513916015625, + "completions/mean_terminated_length": 789.2478637695312, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8012359490703745, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1225193369370279, + "kl": 0.027923583984375, + "learning_rate": 2.0432551579897888e-07, + "loss": 0.1017, + "num_tokens": 2060066718.0, + "reward": 2.4135046005249023, + "reward_std": 0.4386962950229645, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.1758127510547638, + "step": 3760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1077.790283203125, + "completions/mean_terminated_length": 843.9722900390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8014490437376804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.128495007324455, + "kl": 0.024017333984375, + "learning_rate": 2.041112771214386e-07, + "loss": 0.084, + "num_tokens": 2060621456.0, + "reward": 2.4268975257873535, + "reward_std": 0.48979952931404114, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15371057391166687, + "step": 3761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1129.2098388671875, + "completions/mean_terminated_length": 837.3588256835938, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.8016621384049865, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11432182495592118, + "kl": 0.023193359375, + "learning_rate": 2.0389722986683062e-07, + "loss": 0.074, + "num_tokens": 2061196302.0, + "reward": 2.3800225257873535, + "reward_std": 0.3595236539840698, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10854540765285492, + "step": 3762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 905.8214721679688, + "completions/mean_terminated_length": 742.653076171875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8018752330722924, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13336266037123845, + "kl": 0.030029296875, + "learning_rate": 2.0368337415361413e-07, + "loss": 0.0338, + "num_tokens": 2061666654.0, + "reward": 2.5245537757873535, + "reward_std": 0.3657456934452057, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11119430512189865, + "step": 3763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 968.5938110351562, + "completions/mean_terminated_length": 751.5549926757812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8020883277395983, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13204619205709126, + "kl": 0.030609130859375, + "learning_rate": 2.0346971010014178e-07, + "loss": 0.0498, + "num_tokens": 2062170936.0, + "reward": 2.5, + "reward_std": 0.45575597882270813, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13422717154026031, + "step": 3764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1094.274658203125, + "completions/mean_terminated_length": 780.1394653320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8023014224069043, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1341447193300278, + "kl": 0.025146484375, + "learning_rate": 2.0325623782466026e-07, + "loss": 0.0405, + "num_tokens": 2062735555.0, + "reward": 2.33984375, + "reward_std": 0.42057469487190247, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.151995450258255, + "step": 3765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1005.622802734375, + "completions/mean_terminated_length": 757.9862060546875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8025145170742102, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11478078446376976, + "kl": 0.025482177734375, + "learning_rate": 2.0304295744531013e-07, + "loss": 0.0367, + "num_tokens": 2063259530.0, + "reward": 2.3900671005249023, + "reward_std": 0.369486540555954, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1377149522304535, + "step": 3766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 963.0647583007812, + "completions/mean_terminated_length": 785.5298461914062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8027276117415162, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14583690282262873, + "kl": 0.030059814453125, + "learning_rate": 2.0282986908012566e-07, + "loss": 0.0948, + "num_tokens": 2063761879.0, + "reward": 2.4536831378936768, + "reward_std": 0.3953183591365814, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11638233810663223, + "step": 3767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 957.21435546875, + "completions/mean_terminated_length": 716.4686279296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8029407064088221, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12664396139153772, + "kl": 0.029876708984375, + "learning_rate": 2.0261697284703512e-07, + "loss": 0.0497, + "num_tokens": 2064258215.0, + "reward": 2.41796875, + "reward_std": 0.41329115629196167, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14299829304218292, + "step": 3768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 890.4375610351562, + "completions/mean_terminated_length": 770.6896362304688, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8031538010761281, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14014581603126045, + "kl": 0.03125, + "learning_rate": 2.0240426886386025e-07, + "loss": 0.1194, + "num_tokens": 2064725467.0, + "reward": 2.5161831378936768, + "reward_std": 0.4407576322555542, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14151519536972046, + "step": 3769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1073.1942138671875, + "completions/mean_terminated_length": 807.3380737304688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.803366895743434, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12499879519792503, + "kl": 0.026611328125, + "learning_rate": 2.0219175724831637e-07, + "loss": 0.1464, + "num_tokens": 2065278130.0, + "reward": 2.4347100257873535, + "reward_std": 0.4788230061531067, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13826683163642883, + "step": 3770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 886.6027221679688, + "completions/mean_terminated_length": 678.7737426757812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.80357999041074, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1354053015535307, + "kl": 0.03265380859375, + "learning_rate": 2.0197943811801237e-07, + "loss": 0.0521, + "num_tokens": 2065739056.0, + "reward": 2.5301339626312256, + "reward_std": 0.45878246426582336, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15822990238666534, + "step": 3771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 966.8192138671875, + "completions/mean_terminated_length": 756.3493041992188, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8037930850780459, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12846976093896617, + "kl": 0.0279541015625, + "learning_rate": 2.0176731159045073e-07, + "loss": 0.0171, + "num_tokens": 2066242223.0, + "reward": 2.4614956378936768, + "reward_std": 0.4015711545944214, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11582320928573608, + "step": 3772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 855.6339721679688, + "completions/mean_terminated_length": 692.2131958007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8040061797453518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15037841024694723, + "kl": 0.034423828125, + "learning_rate": 2.015553777830271e-07, + "loss": 0.1134, + "num_tokens": 2066687947.0, + "reward": 2.548549175262451, + "reward_std": 0.48142457008361816, + "rewards/accuracy_reward/mean": 0.6921296119689941, + "rewards/accuracy_reward/std": 0.4621478021144867, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15667887032032013, + "step": 3773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 961.04248046875, + "completions/mean_terminated_length": 773.2434692382812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8042192744126578, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14416007855740529, + "kl": 0.027984619140625, + "learning_rate": 2.0134363681303065e-07, + "loss": 0.0369, + "num_tokens": 2067183550.0, + "reward": 2.4815850257873535, + "reward_std": 0.4316096603870392, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14275363087654114, + "step": 3774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 841.3170166015625, + "completions/mean_terminated_length": 748.4952392578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8044323690799637, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13318427695070756, + "kl": 0.03411865234375, + "learning_rate": 2.0113208879764394e-07, + "loss": 0.0404, + "num_tokens": 2067620428.0, + "reward": 2.654017925262451, + "reward_std": 0.3979220688343048, + "rewards/accuracy_reward/mean": 0.7321428656578064, + "rewards/accuracy_reward/std": 0.4433377683162689, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11286582797765732, + "step": 3775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 897.1563110351562, + "completions/mean_terminated_length": 729.3861694335938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8046454637472698, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13125331870450319, + "kl": 0.031707763671875, + "learning_rate": 2.0092073385394266e-07, + "loss": 0.0827, + "num_tokens": 2068087010.0, + "reward": 2.544642925262451, + "reward_std": 0.43339887261390686, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14383502304553986, + "step": 3776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1071.290283203125, + "completions/mean_terminated_length": 829.1531982421875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8048585584145757, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12188286150869962, + "kl": 0.026397705078125, + "learning_rate": 2.007095720988952e-07, + "loss": 0.0879, + "num_tokens": 2068637348.0, + "reward": 2.4888393878936768, + "reward_std": 0.40402692556381226, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09910815209150314, + "step": 3777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1008.0156860351562, + "completions/mean_terminated_length": 760.9475708007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8050716530818817, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1325395845940211, + "kl": 0.029388427734375, + "learning_rate": 2.004986036493638e-07, + "loss": 0.0633, + "num_tokens": 2069155739.0, + "reward": 2.552455425262451, + "reward_std": 0.4079567492008209, + "rewards/accuracy_reward/mean": 0.6428571343421936, + "rewards/accuracy_reward/std": 0.47969305515289307, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1302192211151123, + "step": 3778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 984.6920166015625, + "completions/mean_terminated_length": 774.3048095703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8052847477491876, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13944983256053214, + "kl": 0.030731201171875, + "learning_rate": 2.0028782862210312e-07, + "loss": 0.0649, + "num_tokens": 2069666657.0, + "reward": 2.4073662757873535, + "reward_std": 0.44240111112594604, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1253739446401596, + "step": 3779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 899.700927734375, + "completions/mean_terminated_length": 732.3017578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8054978424164936, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1402763108155338, + "kl": 0.0330810546875, + "learning_rate": 2.0007724713376135e-07, + "loss": 0.0752, + "num_tokens": 2070135387.0, + "reward": 2.5652902126312256, + "reward_std": 0.3437145948410034, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11682131141424179, + "step": 3780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1046.575927734375, + "completions/mean_terminated_length": 743.8197631835938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8057109370837995, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12625194361251593, + "kl": 0.031646728515625, + "learning_rate": 1.9986685930087872e-07, + "loss": 0.0547, + "num_tokens": 2070676941.0, + "reward": 2.325892925262451, + "reward_std": 0.5002307295799255, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.18284475803375244, + "step": 3781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1056.372802734375, + "completions/mean_terminated_length": 817.393310546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8059240317511054, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10443118761932445, + "kl": 0.028076171875, + "learning_rate": 1.996566652398892e-07, + "loss": 0.0456, + "num_tokens": 2071212724.0, + "reward": 2.4012277126312256, + "reward_std": 0.38326969742774963, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12201692909002304, + "step": 3782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 963.638427734375, + "completions/mean_terminated_length": 824.3375244140625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8061371264184114, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13064643936662915, + "kl": 0.02838134765625, + "learning_rate": 1.99446665067119e-07, + "loss": 0.0824, + "num_tokens": 2071714354.0, + "reward": 2.4447546005249023, + "reward_std": 0.4725989103317261, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1514935940504074, + "step": 3783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1044.171875, + "completions/mean_terminated_length": 842.3297729492188, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8063502210857173, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.17672283076157, + "kl": 0.030242919921875, + "learning_rate": 1.9923685889878707e-07, + "loss": 0.0736, + "num_tokens": 2072258447.0, + "reward": 2.4302456378936768, + "reward_std": 0.457394003868103, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 3784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 992.1942138671875, + "completions/mean_terminated_length": 816.2265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8065633157530233, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14480298592758234, + "kl": 0.028350830078125, + "learning_rate": 1.9902724685100513e-07, + "loss": 0.1274, + "num_tokens": 2072772726.0, + "reward": 2.392857313156128, + "reward_std": 0.49510669708251953, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15090012550354004, + "step": 3785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1145.078125, + "completions/mean_terminated_length": 875.5101928710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8067764104203292, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10775098600232735, + "kl": 0.022796630859375, + "learning_rate": 1.988178290397773e-07, + "loss": 0.057, + "num_tokens": 2073352617.0, + "reward": 2.3973214626312256, + "reward_std": 0.40405386686325073, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11531686037778854, + "step": 3786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1096.6629638671875, + "completions/mean_terminated_length": 826.7994384765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8069895050876352, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12103206805729559, + "kl": 0.026824951171875, + "learning_rate": 1.9860860558100057e-07, + "loss": 0.1071, + "num_tokens": 2073912722.0, + "reward": 2.328125, + "reward_std": 0.3918082118034363, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13058780133724213, + "step": 3787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 965.99560546875, + "completions/mean_terminated_length": 741.4285278320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8072025997549411, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13517550430363887, + "kl": 0.02764892578125, + "learning_rate": 1.9839957659046386e-07, + "loss": 0.0859, + "num_tokens": 2074418560.0, + "reward": 2.513392925262451, + "reward_std": 0.40774139761924744, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10688954591751099, + "step": 3788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1029.2054443359375, + "completions/mean_terminated_length": 790.6446533203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.807415694422247, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12544393496048453, + "kl": 0.027374267578125, + "learning_rate": 1.9819074218384865e-07, + "loss": 0.0619, + "num_tokens": 2074940732.0, + "reward": 2.4268975257873535, + "reward_std": 0.4089321196079254, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16824373602867126, + "step": 3789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1122.4375, + "completions/mean_terminated_length": 810.2328491210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.807628789089553, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.134705934958554, + "kl": 0.02264404296875, + "learning_rate": 1.9798210247672907e-07, + "loss": 0.0937, + "num_tokens": 2075513728.0, + "reward": 2.3699777126312256, + "reward_std": 0.48972755670547485, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549686908722, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.1475696861743927, + "step": 3790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 961.1942138671875, + "completions/mean_terminated_length": 756.5172119140625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.807841883756859, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1309955525681121, + "kl": 0.03009033203125, + "learning_rate": 1.977736575845711e-07, + "loss": 0.0251, + "num_tokens": 2076013079.0, + "reward": 2.4486608505249023, + "reward_std": 0.43551793694496155, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13840332627296448, + "step": 3791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 956.6004638671875, + "completions/mean_terminated_length": 733.6263427734375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.808054978424165, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13587869737708755, + "kl": 0.029571533203125, + "learning_rate": 1.97565407622733e-07, + "loss": 0.0704, + "num_tokens": 2076513412.0, + "reward": 2.435267925262451, + "reward_std": 0.30990543961524963, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.08213625103235245, + "step": 3792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1082.7679443359375, + "completions/mean_terminated_length": 826.4632568359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8082680730914709, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1440912242219125, + "kl": 0.02459716796875, + "learning_rate": 1.973573527064651e-07, + "loss": 0.0708, + "num_tokens": 2077073244.0, + "reward": 2.341517925262451, + "reward_std": 0.4613869786262512, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1826944798231125, + "step": 3793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1040.337158203125, + "completions/mean_terminated_length": 837.723876953125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8084811677587769, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1355983875506404, + "kl": 0.0264892578125, + "learning_rate": 1.971494929509101e-07, + "loss": 0.0832, + "num_tokens": 2077613507.0, + "reward": 2.4341518878936768, + "reward_std": 0.4743957817554474, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1401556432247162, + "step": 3794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1027.59375, + "completions/mean_terminated_length": 798.9781494140625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8086942624260828, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13451323027148585, + "kl": 0.0263671875, + "learning_rate": 1.9694182847110247e-07, + "loss": 0.1089, + "num_tokens": 2078144109.0, + "reward": 2.439732313156128, + "reward_std": 0.4938867688179016, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16231535375118256, + "step": 3795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 885.0156860351562, + "completions/mean_terminated_length": 722.2570190429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8089073570933888, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14898892363526484, + "kl": 0.034088134765625, + "learning_rate": 1.9673435938196826e-07, + "loss": 0.0503, + "num_tokens": 2078614548.0, + "reward": 2.5552456378936768, + "reward_std": 0.3411842882633209, + "rewards/accuracy_reward/mean": 0.6620370149612427, + "rewards/accuracy_reward/std": 0.4735642969608307, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12825222313404083, + "step": 3796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1066.857177734375, + "completions/mean_terminated_length": 833.7680053710938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8091204517606947, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12697935977043914, + "kl": 0.027069091796875, + "learning_rate": 1.9652708579832605e-07, + "loss": 0.0796, + "num_tokens": 2079159972.0, + "reward": 2.35546875, + "reward_std": 0.472360223531723, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13598167896270752, + "step": 3797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1094.171875, + "completions/mean_terminated_length": 874.0577392578125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8093335464280006, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1283896540380711, + "kl": 0.024658203125, + "learning_rate": 1.963200078348858e-07, + "loss": 0.1107, + "num_tokens": 2079719041.0, + "reward": 2.4017858505249023, + "reward_std": 0.4016064405441284, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14187753200531006, + "step": 3798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 948.2120971679688, + "completions/mean_terminated_length": 761.56396484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8095466410953066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12906811895578327, + "kl": 0.029541015625, + "learning_rate": 1.961131256062494e-07, + "loss": 0.0794, + "num_tokens": 2080214432.0, + "reward": 2.4760046005249023, + "reward_std": 0.46407073736190796, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15764793753623962, + "step": 3799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1190.555908203125, + "completions/mean_terminated_length": 931.3284912109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8097597357626125, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1190227112544496, + "kl": 0.023834228515625, + "learning_rate": 1.9590643922691013e-07, + "loss": 0.0928, + "num_tokens": 2080821673.0, + "reward": 2.4029018878936768, + "reward_std": 0.5205156207084656, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.17233900725841522, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1079.977783203125, + "completions/mean_terminated_length": 794.60693359375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8099728304299185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12316714583750477, + "kl": 0.025665283203125, + "learning_rate": 1.9569994881125317e-07, + "loss": 0.0857, + "num_tokens": 2081381087.0, + "reward": 2.3911831378936768, + "reward_std": 0.5174334049224854, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15764793753623962, + "step": 3801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1106.078125, + "completions/mean_terminated_length": 824.86669921875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8101859250972244, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.09862052381771616, + "kl": 0.023284912109375, + "learning_rate": 1.9549365447355527e-07, + "loss": 0.0315, + "num_tokens": 2081945298.0, + "reward": 2.392857313156128, + "reward_std": 0.3353942334651947, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13106489181518555, + "step": 3802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1008.7210083007812, + "completions/mean_terminated_length": 758.257568359375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8103990197645304, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13738314807664986, + "kl": 0.028472900390625, + "learning_rate": 1.9528755632798444e-07, + "loss": 0.0859, + "num_tokens": 2082464933.0, + "reward": 2.4268975257873535, + "reward_std": 0.48533666133880615, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.17139743268489838, + "step": 3803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 995.9085083007812, + "completions/mean_terminated_length": 745.964111328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8106121144318363, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12508686190014665, + "kl": 0.026458740234375, + "learning_rate": 1.9508165448860025e-07, + "loss": 0.0544, + "num_tokens": 2082980460.0, + "reward": 2.4681921005249023, + "reward_std": 0.38320353627204895, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1226801648736, + "step": 3804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 958.0402221679688, + "completions/mean_terminated_length": 776.3802490234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8108252090991422, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1465439941462836, + "kl": 0.032501220703125, + "learning_rate": 1.9487594906935355e-07, + "loss": 0.0513, + "num_tokens": 2083486222.0, + "reward": 2.4034600257873535, + "reward_std": 0.4473932981491089, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13311463594436646, + "step": 3805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 1091.091552734375, + "completions/mean_terminated_length": 801.7935791015625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8110383037664483, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.35775991035324856, + "kl": 0.02960205078125, + "learning_rate": 1.9467044018408685e-07, + "loss": 0.0864, + "num_tokens": 2084044039.0, + "reward": 2.248326063156128, + "reward_std": 0.46287810802459717, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9469866156578064, + "rewards/tag_count_reward/std": 0.18043838441371918, + "step": 3806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 979.4442138671875, + "completions/mean_terminated_length": 781.5634765625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8112513984337542, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13152068794394575, + "kl": 0.02734375, + "learning_rate": 1.9446512794653322e-07, + "loss": 0.1027, + "num_tokens": 2084550174.0, + "reward": 2.44140625, + "reward_std": 0.4648064970970154, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14801737666130066, + "step": 3807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1107.6295166015625, + "completions/mean_terminated_length": 861.2788696289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8114644931010602, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11873667335394705, + "kl": 0.024810791015625, + "learning_rate": 1.9426001247031725e-07, + "loss": 0.0909, + "num_tokens": 2085114008.0, + "reward": 2.353236675262451, + "reward_std": 0.4628064036369324, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15332838892936707, + "step": 3808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1025.7076416015625, + "completions/mean_terminated_length": 800.0789794921875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8116775877683661, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1253862565931331, + "kl": 0.026947021484375, + "learning_rate": 1.9405509386895492e-07, + "loss": 0.0434, + "num_tokens": 2085648197.0, + "reward": 2.458705425262451, + "reward_std": 0.3712116777896881, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.10495071113109589, + "step": 3809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1022.1339721679688, + "completions/mean_terminated_length": 828.9336547851562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.8118906824356721, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13963517370526465, + "kl": 0.0286865234375, + "learning_rate": 1.9385037225585292e-07, + "loss": 0.1263, + "num_tokens": 2086175553.0, + "reward": 2.5318081378936768, + "reward_std": 0.4854374825954437, + "rewards/accuracy_reward/mean": 0.6741071343421936, + "rewards/accuracy_reward/std": 0.4692314565181732, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.17038200795650482, + "step": 3810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1065.399658203125, + "completions/mean_terminated_length": 821.8021850585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.812103777102978, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12967905736453128, + "kl": 0.02593994140625, + "learning_rate": 1.9364584774430893e-07, + "loss": 0.0785, + "num_tokens": 2086721668.0, + "reward": 2.4765625, + "reward_std": 0.403864324092865, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316954612732, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11589459329843521, + "step": 3811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 840.1473388671875, + "completions/mean_terminated_length": 638.8385620117188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.812316871770284, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1346870635567144, + "kl": 0.031005859375, + "learning_rate": 1.9344152044751162e-07, + "loss": 0.0816, + "num_tokens": 2087164102.0, + "reward": 2.5630581378936768, + "reward_std": 0.41586530208587646, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13381615281105042, + "step": 3812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 1018.7745971679688, + "completions/mean_terminated_length": 811.8257446289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8125299664375899, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13519956505613137, + "kl": 0.0267333984375, + "learning_rate": 1.932373904785404e-07, + "loss": 0.043, + "num_tokens": 2087688817.0, + "reward": 2.3387277126312256, + "reward_std": 0.40027564764022827, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1293378323316574, + "step": 3813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1010.2053833007812, + "completions/mean_terminated_length": 781.1552734375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8127430611048958, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12082378955727528, + "kl": 0.026824951171875, + "learning_rate": 1.9303345795036596e-07, + "loss": 0.0559, + "num_tokens": 2088212653.0, + "reward": 2.3286831378936768, + "reward_std": 0.3728073835372925, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14963631331920624, + "step": 3814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1005.8772583007812, + "completions/mean_terminated_length": 816.1504516601562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8129561557722018, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13111701838697298, + "kl": 0.028076171875, + "learning_rate": 1.9282972297584883e-07, + "loss": 0.0779, + "num_tokens": 2088735606.0, + "reward": 2.502232313156128, + "reward_std": 0.4124475419521332, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407235741615295, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09438535571098328, + "step": 3815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 989.1027221679688, + "completions/mean_terminated_length": 815.8285522460938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8131692504395077, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1265401464176991, + "kl": 0.02935791015625, + "learning_rate": 1.926261856677411e-07, + "loss": 0.0619, + "num_tokens": 2089244068.0, + "reward": 2.357701063156128, + "reward_std": 0.37576520442962646, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11440251022577286, + "step": 3816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1001.5803833007812, + "completions/mean_terminated_length": 852.091796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8133823451068137, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12600557532524928, + "kl": 0.028045654296875, + "learning_rate": 1.9242284613868492e-07, + "loss": 0.0532, + "num_tokens": 2089768232.0, + "reward": 2.4520089626312256, + "reward_std": 0.4262053668498993, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12473504990339279, + "step": 3817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 904.3817138671875, + "completions/mean_terminated_length": 776.682373046875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8135954397741196, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13995609731510578, + "kl": 0.030792236328125, + "learning_rate": 1.9221970450121324e-07, + "loss": 0.095, + "num_tokens": 2090248515.0, + "reward": 2.52734375, + "reward_std": 0.4384230375289917, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.15014436841011047, + "step": 3818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 832.2745971679688, + "completions/mean_terminated_length": 689.7830810546875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8138085344414256, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13702210624351652, + "kl": 0.031768798828125, + "learning_rate": 1.920167608677494e-07, + "loss": 0.0594, + "num_tokens": 2090690734.0, + "reward": 2.625, + "reward_std": 0.35233721137046814, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.45739173889160156, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10517053306102753, + "step": 3819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 882.68310546875, + "completions/mean_terminated_length": 746.0997924804688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8140216291087315, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1420290558236536, + "kl": 0.02996826171875, + "learning_rate": 1.91814015350607e-07, + "loss": 0.0699, + "num_tokens": 2091156784.0, + "reward": 2.4575893878936768, + "reward_std": 0.46678900718688965, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.14942027628421783, + "step": 3820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 992.4553833007812, + "completions/mean_terminated_length": 793.665771484375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8142347237760375, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12768523191439335, + "kl": 0.028717041015625, + "learning_rate": 1.916114680619904e-07, + "loss": 0.0663, + "num_tokens": 2091669836.0, + "reward": 2.4581475257873535, + "reward_std": 0.47829800844192505, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.49405214190483093, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13598167896270752, + "step": 3821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1116.87060546875, + "completions/mean_terminated_length": 866.2832641601562, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8144478184433435, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.112577832189833, + "kl": 0.0240478515625, + "learning_rate": 1.9140911911399393e-07, + "loss": 0.0324, + "num_tokens": 2092244834.0, + "reward": 2.416294813156128, + "reward_std": 0.36861851811408997, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12405261397361755, + "step": 3822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1042.1429443359375, + "completions/mean_terminated_length": 823.478271484375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8146609131106494, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12958577892049164, + "kl": 0.027679443359375, + "learning_rate": 1.9120696861860226e-07, + "loss": 0.0466, + "num_tokens": 2092777106.0, + "reward": 2.334263563156128, + "reward_std": 0.429955393075943, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14321638643741608, + "step": 3823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 944.2120971679688, + "completions/mean_terminated_length": 711.5216674804688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8148740077779554, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15337722624225922, + "kl": 0.030548095703125, + "learning_rate": 1.910050166876902e-07, + "loss": 0.0624, + "num_tokens": 2093262737.0, + "reward": 2.455357313156128, + "reward_std": 0.4916169047355652, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.16768723726272583, + "step": 3824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 961.2902221679688, + "completions/mean_terminated_length": 821.6876220703125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8150871024452613, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1432327153351817, + "kl": 0.0289306640625, + "learning_rate": 1.908032634330227e-07, + "loss": 0.0737, + "num_tokens": 2093765091.0, + "reward": 2.5541296005249023, + "reward_std": 0.41940391063690186, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.11946304142475128, + "step": 3825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1093.904052734375, + "completions/mean_terminated_length": 857.3732299804688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8153001971125673, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12096446403567682, + "kl": 0.0247802734375, + "learning_rate": 1.9060170896625466e-07, + "loss": 0.0702, + "num_tokens": 2094329544.0, + "reward": 2.3275671005249023, + "reward_std": 0.4879489541053772, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.1706821471452713, + "step": 3826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1091.4420166015625, + "completions/mean_terminated_length": 844.2415771484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8155132917798732, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11362024005693917, + "kl": 0.024017333984375, + "learning_rate": 1.9040035339893102e-07, + "loss": 0.0789, + "num_tokens": 2094887486.0, + "reward": 2.3761162757873535, + "reward_std": 0.3883502185344696, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11987641453742981, + "step": 3827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1126.7388916015625, + "completions/mean_terminated_length": 904.7174072265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8157263864471792, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12341640906879822, + "kl": 0.02490234375, + "learning_rate": 1.9019919684248689e-07, + "loss": 0.0844, + "num_tokens": 2095457529.0, + "reward": 2.3465402126312256, + "reward_std": 0.45748406648635864, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822286784648895, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15223345160484314, + "step": 3828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 980.8058471679688, + "completions/mean_terminated_length": 809.3911743164062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8159394811144851, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13535965508258643, + "kl": 0.028900146484375, + "learning_rate": 1.8999823940824688e-07, + "loss": 0.134, + "num_tokens": 2095962498.0, + "reward": 2.447544813156128, + "reward_std": 0.47021356225013733, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14886364340782166, + "step": 3829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 881.7277221679688, + "completions/mean_terminated_length": 731.9042358398438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.816152575781791, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1361864352144198, + "kl": 0.034271240234375, + "learning_rate": 1.8979748120742562e-07, + "loss": 0.0689, + "num_tokens": 2096423960.0, + "reward": 2.5597100257873535, + "reward_std": 0.459262877702713, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.4769369065761566, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 3830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 952.4464721679688, + "completions/mean_terminated_length": 783.0308837890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.816365670449097, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11655359659526496, + "kl": 0.028778076171875, + "learning_rate": 1.8959692235112735e-07, + "loss": 0.083, + "num_tokens": 2096920000.0, + "reward": 2.4441964626312256, + "reward_std": 0.4263933300971985, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 3831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1056.8170166015625, + "completions/mean_terminated_length": 844.6124877929688, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8165787651164029, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1150665711979237, + "kl": 0.025726318359375, + "learning_rate": 1.8939656295034588e-07, + "loss": 0.0365, + "num_tokens": 2097469118.0, + "reward": 2.357142925262451, + "reward_std": 0.4608336091041565, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14238695800304413, + "step": 3832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 897.10498046875, + "completions/mean_terminated_length": 725.9461669921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8167918597837089, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1281683491048, + "kl": 0.030914306640625, + "learning_rate": 1.891964031159653e-07, + "loss": 0.0299, + "num_tokens": 2097941821.0, + "reward": 2.5106027126312256, + "reward_std": 0.3279326558113098, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.09791535139083862, + "step": 3833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 986.4576416015625, + "completions/mean_terminated_length": 762.6729736328125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8170049544510148, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.3264248516741544, + "kl": 0.0328369140625, + "learning_rate": 1.8899644295875815e-07, + "loss": 0.0799, + "num_tokens": 2098454202.0, + "reward": 2.493861675262451, + "reward_std": 0.48603954911231995, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12892211973667145, + "step": 3834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 902.325927734375, + "completions/mean_terminated_length": 748.6025390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8172180491183209, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14020973383759344, + "kl": 0.03302001953125, + "learning_rate": 1.887966825893875e-07, + "loss": 0.0596, + "num_tokens": 2098921756.0, + "reward": 2.615513563156128, + "reward_std": 0.40775319933891296, + "rewards/accuracy_reward/mean": 0.7098214030265808, + "rewards/accuracy_reward/std": 0.4543519914150238, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 3835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1113.2545166015625, + "completions/mean_terminated_length": 837.693603515625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8174311437856268, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.106747157000383, + "kl": 0.023712158203125, + "learning_rate": 1.8859712211840522e-07, + "loss": 0.0357, + "num_tokens": 2099490926.0, + "reward": 2.369419813156128, + "reward_std": 0.37991073727607727, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1479213982820511, + "step": 3836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1094.247802734375, + "completions/mean_terminated_length": 890.0569458007812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8176442384529328, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.2685312444455439, + "kl": 0.02783203125, + "learning_rate": 1.8839776165625288e-07, + "loss": 0.0968, + "num_tokens": 2100053341.0, + "reward": 2.1746652126312256, + "reward_std": 0.48554307222366333, + "rewards/accuracy_reward/mean": 0.35648149251937866, + "rewards/accuracy_reward/std": 0.47951504588127136, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.9380580186843872, + "rewards/tag_count_reward/std": 0.2018849104642868, + "step": 3837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1059.265625, + "completions/mean_terminated_length": 807.2352905273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8178573331202387, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12807251681719428, + "kl": 0.027923583984375, + "learning_rate": 1.8819860131326114e-07, + "loss": 0.1079, + "num_tokens": 2100598772.0, + "reward": 2.3197546005249023, + "reward_std": 0.507283091545105, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.14529338479042053, + "step": 3838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 916.5960083007812, + "completions/mean_terminated_length": 748.3359375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8180704277875446, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.130171173202459, + "kl": 0.028717041015625, + "learning_rate": 1.8799964119964994e-07, + "loss": 0.0373, + "num_tokens": 2101076655.0, + "reward": 2.53125, + "reward_std": 0.3690985441207886, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.4852356016635895, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11566276848316193, + "step": 3839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 861.3214721679688, + "completions/mean_terminated_length": 698.68017578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8182835224548506, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13289740377845502, + "kl": 0.02978515625, + "learning_rate": 1.878008814255287e-07, + "loss": 0.0695, + "num_tokens": 2101534335.0, + "reward": 2.5078125, + "reward_std": 0.3831934630870819, + "rewards/accuracy_reward/mean": 0.6226851940155029, + "rewards/accuracy_reward/std": 0.4852766990661621, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15391045808792114, + "step": 3840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 985.1585083007812, + "completions/mean_terminated_length": 761.1000366210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8184966171221565, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1325864013246796, + "kl": 0.02691650390625, + "learning_rate": 1.8760232210089549e-07, + "loss": 0.0748, + "num_tokens": 2102050614.0, + "reward": 2.3777902126312256, + "reward_std": 0.481874018907547, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15941192209720612, + "step": 3841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 981.9754638671875, + "completions/mean_terminated_length": 787.8971557617188, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.8187097117894625, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13756733492060325, + "kl": 0.027069091796875, + "learning_rate": 1.8740396333563797e-07, + "loss": 0.0651, + "num_tokens": 2102563339.0, + "reward": 2.4486608505249023, + "reward_std": 0.3990635275840759, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10688954591751099, + "step": 3842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1038.0179443359375, + "completions/mean_terminated_length": 828.39892578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8189228064567684, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13069393468803966, + "kl": 0.02618408203125, + "learning_rate": 1.872058052395323e-07, + "loss": 0.0775, + "num_tokens": 2103093475.0, + "reward": 2.439732313156128, + "reward_std": 0.43193283677101135, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12607400119304657, + "step": 3843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 974.6808471679688, + "completions/mean_terminated_length": 799.0467529296875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.8191359011240744, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12573697747410711, + "kl": 0.02899169921875, + "learning_rate": 1.8700784792224394e-07, + "loss": 0.0526, + "num_tokens": 2103596308.0, + "reward": 2.3588171005249023, + "reward_std": 0.46852564811706543, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15488377213478088, + "step": 3844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 979.5067138671875, + "completions/mean_terminated_length": 778.2785034179688, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8193489957913803, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1338155037707908, + "kl": 0.0306396484375, + "learning_rate": 1.8681009149332708e-07, + "loss": 0.1056, + "num_tokens": 2104101607.0, + "reward": 2.517857313156128, + "reward_std": 0.4564667046070099, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12733516097068787, + "step": 3845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1017.8326416015625, + "completions/mean_terminated_length": 776.6088256835938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8195620904586862, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12011659044037523, + "kl": 0.02557373046875, + "learning_rate": 1.8661253606222462e-07, + "loss": 0.1024, + "num_tokens": 2104638236.0, + "reward": 2.372767925262451, + "reward_std": 0.4167993664741516, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.16370917856693268, + "step": 3846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1125.3460693359375, + "completions/mean_terminated_length": 924.76904296875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8197751851259922, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11385795383098586, + "kl": 0.0235595703125, + "learning_rate": 1.8641518173826858e-07, + "loss": 0.0296, + "num_tokens": 2105209591.0, + "reward": 2.364955425262451, + "reward_std": 0.44534406065940857, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13911856710910797, + "step": 3847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1088.52685546875, + "completions/mean_terminated_length": 826.852294921875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8199882797932981, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.124590886487887, + "kl": 0.0242919921875, + "learning_rate": 1.8621802863067959e-07, + "loss": 0.0939, + "num_tokens": 2105775107.0, + "reward": 2.3325893878936768, + "reward_std": 0.4274972379207611, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.15048591792583466, + "step": 3848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 937.8035888671875, + "completions/mean_terminated_length": 762.8114013671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8202013744606041, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15348456023486645, + "kl": 0.030029296875, + "learning_rate": 1.8602107684856637e-07, + "loss": 0.1029, + "num_tokens": 2106256779.0, + "reward": 2.5267858505249023, + "reward_std": 0.49287956953048706, + "rewards/accuracy_reward/mean": 0.6428571343421936, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15220165252685547, + "step": 3849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 975.6250610351562, + "completions/mean_terminated_length": 812.9768676757812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.82041446912791, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11885162763176045, + "kl": 0.028778076171875, + "learning_rate": 1.8582432650092705e-07, + "loss": 0.0552, + "num_tokens": 2106769155.0, + "reward": 2.40625, + "reward_std": 0.3230324387550354, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005797147750854, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11119429767131805, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 973.0156860351562, + "completions/mean_terminated_length": 777.3060913085938, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8206275637952161, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12714781788741422, + "kl": 0.029937744140625, + "learning_rate": 1.8562777769664758e-07, + "loss": 0.0888, + "num_tokens": 2107279162.0, + "reward": 2.4363839626312256, + "reward_std": 0.4241209328174591, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12247265130281448, + "step": 3851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 898.2098388671875, + "completions/mean_terminated_length": 750.5037231445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.820840658462522, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14487028545693348, + "kl": 0.03302001953125, + "learning_rate": 1.8543143054450305e-07, + "loss": 0.0826, + "num_tokens": 2107748344.0, + "reward": 2.4793527126312256, + "reward_std": 0.43265241384506226, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13566918671131134, + "step": 3852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 920.79248046875, + "completions/mean_terminated_length": 766.302001953125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.821053753129828, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.2758949112052222, + "kl": 0.029266357421875, + "learning_rate": 1.8523528515315635e-07, + "loss": 0.0425, + "num_tokens": 2108227035.0, + "reward": 2.533482313156128, + "reward_std": 0.41881516575813293, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11923423409461975, + "step": 3853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1000.5803833007812, + "completions/mean_terminated_length": 806.6137084960938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8212668477971339, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14576830671774685, + "kl": 0.027130126953125, + "learning_rate": 1.8503934163115875e-07, + "loss": 0.0337, + "num_tokens": 2108737039.0, + "reward": 2.4296875, + "reward_std": 0.47552403807640076, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15116052329540253, + "step": 3854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 987.6920166015625, + "completions/mean_terminated_length": 753.6730346679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8214799424644398, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12487412155197368, + "kl": 0.02813720703125, + "learning_rate": 1.8484360008695036e-07, + "loss": 0.0381, + "num_tokens": 2109249397.0, + "reward": 2.45703125, + "reward_std": 0.39589551091194153, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1091761440038681, + "step": 3855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1027.29248046875, + "completions/mean_terminated_length": 808.7669677734375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8216930371317458, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1357954579969625, + "kl": 0.02862548828125, + "learning_rate": 1.8464806062885897e-07, + "loss": 0.0881, + "num_tokens": 2109779480.0, + "reward": 2.5223214626312256, + "reward_std": 0.3850313127040863, + "rewards/accuracy_reward/mean": 0.6319444179534912, + "rewards/accuracy_reward/std": 0.48283568024635315, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12449962645769119, + "step": 3856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1087.3125, + "completions/mean_terminated_length": 821.8233642578125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8219061317990517, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1325053492479136, + "kl": 0.02606201171875, + "learning_rate": 1.844527233651007e-07, + "loss": 0.1443, + "num_tokens": 2110331076.0, + "reward": 2.4252233505249023, + "reward_std": 0.43619370460510254, + "rewards/accuracy_reward/mean": 0.5532407164573669, + "rewards/accuracy_reward/std": 0.4977337718009949, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1717585027217865, + "step": 3857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1008.4910888671875, + "completions/mean_terminated_length": 835.2396240234375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8221192264663577, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13293971863473403, + "kl": 0.027557373046875, + "learning_rate": 1.842575884037797e-07, + "loss": 0.0716, + "num_tokens": 2110853616.0, + "reward": 2.4849331378936768, + "reward_std": 0.385657399892807, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14874933660030365, + "step": 3858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1057.607177734375, + "completions/mean_terminated_length": 812.0780029296875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8223323211336636, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13276222701829757, + "kl": 0.029052734375, + "learning_rate": 1.8406265585288856e-07, + "loss": 0.0784, + "num_tokens": 2111397232.0, + "reward": 2.4369421005249023, + "reward_std": 0.48796749114990234, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1579248607158661, + "step": 3859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1024.22998046875, + "completions/mean_terminated_length": 748.7110595703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8225454158009696, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13464395208980903, + "kl": 0.0252685546875, + "learning_rate": 1.8386792582030713e-07, + "loss": 0.098, + "num_tokens": 2111925767.0, + "reward": 2.4503350257873535, + "reward_std": 0.48645058274269104, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13622933626174927, + "step": 3860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1114.263427734375, + "completions/mean_terminated_length": 817.6647338867188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8227585104682755, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1118619029596137, + "kl": 0.024658203125, + "learning_rate": 1.8367339841380365e-07, + "loss": 0.0556, + "num_tokens": 2112498557.0, + "reward": 2.4140625, + "reward_std": 0.36657649278640747, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266101151704788, + "step": 3861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1176.888427734375, + "completions/mean_terminated_length": 893.3905639648438, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.8229716051355814, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12059987794815479, + "kl": 0.022308349609375, + "learning_rate": 1.834790737410343e-07, + "loss": 0.0481, + "num_tokens": 2113104523.0, + "reward": 2.2628350257873535, + "reward_std": 0.4339313805103302, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15933358669281006, + "step": 3862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1030.26123046875, + "completions/mean_terminated_length": 812.3712768554688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8231846998028874, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1394780496726591, + "kl": 0.02801513671875, + "learning_rate": 1.8328495190954294e-07, + "loss": 0.0973, + "num_tokens": 2113633728.0, + "reward": 2.3236608505249023, + "reward_std": 0.49038365483283997, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.1637244075536728, + "step": 3863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 930.4375610351562, + "completions/mean_terminated_length": 780.486083984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8233977944701933, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13991560733372568, + "kl": 0.02880859375, + "learning_rate": 1.83091033026761e-07, + "loss": 0.1137, + "num_tokens": 2114121252.0, + "reward": 2.5306921005249023, + "reward_std": 0.41868260502815247, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11418405920267105, + "step": 3864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 952.4688110351562, + "completions/mean_terminated_length": 753.0184936523438, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8236108891374994, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1350422985019993, + "kl": 0.0279541015625, + "learning_rate": 1.8289731720000784e-07, + "loss": 0.1227, + "num_tokens": 2114621286.0, + "reward": 2.4614956378936768, + "reward_std": 0.4467255175113678, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14945264160633087, + "step": 3865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1081.0179443359375, + "completions/mean_terminated_length": 827.6957397460938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8238239838048053, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12892943630633458, + "kl": 0.025634765625, + "learning_rate": 1.8270380453649012e-07, + "loss": 0.0497, + "num_tokens": 2115170894.0, + "reward": 2.404017925262451, + "reward_std": 0.4098993241786957, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1300705373287201, + "step": 3866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 933.4107666015625, + "completions/mean_terminated_length": 774.1836547851562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8240370784721113, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12439047973757066, + "kl": 0.0284423828125, + "learning_rate": 1.8251049514330268e-07, + "loss": 0.0543, + "num_tokens": 2115653142.0, + "reward": 2.584263563156128, + "reward_std": 0.4098218083381653, + "rewards/accuracy_reward/mean": 0.6696428656578064, + "rewards/accuracy_reward/std": 0.47086748480796814, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.11734377592802048, + "step": 3867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 927.0000610351562, + "completions/mean_terminated_length": 789.3333129882812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8242501731394172, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13672328599911926, + "kl": 0.029754638671875, + "learning_rate": 1.823173891274271e-07, + "loss": 0.1029, + "num_tokens": 2116144390.0, + "reward": 2.505580425262451, + "reward_std": 0.44610515236854553, + "rewards/accuracy_reward/mean": 0.6412037014961243, + "rewards/accuracy_reward/std": 0.48020341992378235, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14835961163043976, + "step": 3868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1038.5625, + "completions/mean_terminated_length": 838.834228515625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8244632678067232, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12181813640928163, + "kl": 0.02777099609375, + "learning_rate": 1.8212448659573298e-07, + "loss": 0.0398, + "num_tokens": 2116685970.0, + "reward": 2.51171875, + "reward_std": 0.4328148365020752, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12517839670181274, + "step": 3869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1027.40185546875, + "completions/mean_terminated_length": 802.1471557617188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8246763624740291, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1201401857596392, + "kl": 0.02813720703125, + "learning_rate": 1.8193178765497708e-07, + "loss": -0.0019, + "num_tokens": 2117219638.0, + "reward": 2.4520089626312256, + "reward_std": 0.3480514585971832, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.11305922269821167, + "step": 3870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1140.5, + "completions/mean_terminated_length": 827.09912109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.824889457141335, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10831456889204942, + "kl": 0.0234375, + "learning_rate": 1.8173929241180347e-07, + "loss": 0.0635, + "num_tokens": 2117802742.0, + "reward": 2.4213171005249023, + "reward_std": 0.3452896475791931, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12540756165981293, + "step": 3871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1064.90185546875, + "completions/mean_terminated_length": 807.3577270507812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.825102551808641, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12876231627715187, + "kl": 0.027099609375, + "learning_rate": 1.8154700097274365e-07, + "loss": 0.1125, + "num_tokens": 2118348618.0, + "reward": 2.421875, + "reward_std": 0.40664952993392944, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11244472116231918, + "step": 3872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 993.5313110351562, + "completions/mean_terminated_length": 839.8107299804688, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.8253156464759469, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13353373629243176, + "kl": 0.029144287109375, + "learning_rate": 1.8135491344421592e-07, + "loss": 0.0492, + "num_tokens": 2118867896.0, + "reward": 2.4681921005249023, + "reward_std": 0.45001834630966187, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15306761860847473, + "step": 3873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 928.6094360351562, + "completions/mean_terminated_length": 745.4363403320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8255287411432529, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14160324242092542, + "kl": 0.03265380859375, + "learning_rate": 1.8116302993252636e-07, + "loss": 0.115, + "num_tokens": 2119353577.0, + "reward": 2.424107313156128, + "reward_std": 0.4820365309715271, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.16000695526599884, + "step": 3874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 972.888427734375, + "completions/mean_terminated_length": 742.7154541015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8257418358105588, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12670570313110796, + "kl": 0.026763916015625, + "learning_rate": 1.8097135054386766e-07, + "loss": 0.106, + "num_tokens": 2119859287.0, + "reward": 2.3426339626312256, + "reward_std": 0.39663228392601013, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1591111123561859, + "step": 3875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 924.7388916015625, + "completions/mean_terminated_length": 767.5394287109375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8259549304778648, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13285470215719886, + "kl": 0.0301513671875, + "learning_rate": 1.807798753843197e-07, + "loss": 0.0997, + "num_tokens": 2120344882.0, + "reward": 2.529576063156128, + "reward_std": 0.43511202931404114, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13598167896270752, + "step": 3876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1053.046875, + "completions/mean_terminated_length": 878.0813598632812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8261680251451707, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12155558092311321, + "kl": 0.02716064453125, + "learning_rate": 1.8058860455984936e-07, + "loss": 0.0712, + "num_tokens": 2120891719.0, + "reward": 2.4386162757873535, + "reward_std": 0.42629992961883545, + "rewards/accuracy_reward/mean": 0.5763888955116272, + "rewards/accuracy_reward/std": 0.4947032034397125, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.151918426156044, + "step": 3877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 930.30810546875, + "completions/mean_terminated_length": 773.8880615234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8263811198124766, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11684979459665312, + "kl": 0.030426025390625, + "learning_rate": 1.803975381763103e-07, + "loss": 0.0823, + "num_tokens": 2121378225.0, + "reward": 2.521205425262451, + "reward_std": 0.396239697933197, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.10841450095176697, + "step": 3878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 994.0558471679688, + "completions/mean_terminated_length": 818.3984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8265942144797827, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12165412316368807, + "kl": 0.027313232421875, + "learning_rate": 1.8020667633944325e-07, + "loss": 0.0322, + "num_tokens": 2121897642.0, + "reward": 2.368861675262451, + "reward_std": 0.41608792543411255, + "rewards/accuracy_reward/mean": 0.4441964328289032, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11561823636293411, + "step": 3879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 904.6250610351562, + "completions/mean_terminated_length": 747.9187622070312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8268073091470886, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13388085119862358, + "kl": 0.029296875, + "learning_rate": 1.8001601915487545e-07, + "loss": 0.0505, + "num_tokens": 2122371074.0, + "reward": 2.4888393878936768, + "reward_std": 0.3655345141887665, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12782442569732666, + "step": 3880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1013.9576416015625, + "completions/mean_terminated_length": 782.286865234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8270204038143946, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12578737094431588, + "kl": 0.02886962890625, + "learning_rate": 1.798255667281213e-07, + "loss": 0.0449, + "num_tokens": 2122905439.0, + "reward": 2.4609375, + "reward_std": 0.40351077914237976, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1445406824350357, + "step": 3881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 910.232177734375, + "completions/mean_terminated_length": 751.0025634765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.8272334984817005, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13519484017282188, + "kl": 0.02880859375, + "learning_rate": 1.7963531916458152e-07, + "loss": 0.0627, + "num_tokens": 2123381959.0, + "reward": 2.6138393878936768, + "reward_std": 0.42811423540115356, + "rewards/accuracy_reward/mean": 0.6964285969734192, + "rewards/accuracy_reward/std": 0.4603137969970703, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12270178645849228, + "step": 3882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1010.9085083007812, + "completions/mean_terminated_length": 853.61181640625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8274465931490065, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1404572799736099, + "kl": 0.02880859375, + "learning_rate": 1.794452765695436e-07, + "loss": 0.1375, + "num_tokens": 2123909374.0, + "reward": 2.412388563156128, + "reward_std": 0.47555533051490784, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1293378323316574, + "step": 3883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 982.68310546875, + "completions/mean_terminated_length": 811.5699462890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8276596878163124, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14423628699285146, + "kl": 0.03118896484375, + "learning_rate": 1.7925543904818151e-07, + "loss": 0.1118, + "num_tokens": 2124422320.0, + "reward": 2.5106027126312256, + "reward_std": 0.48660579323768616, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16055898368358612, + "step": 3884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1037.466552734375, + "completions/mean_terminated_length": 807.6740112304688, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8278727824836184, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13426936768068357, + "kl": 0.0296630859375, + "learning_rate": 1.790658067055558e-07, + "loss": 0.1053, + "num_tokens": 2124952145.0, + "reward": 2.4151787757873535, + "reward_std": 0.5645465850830078, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.18115736544132233, + "step": 3885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1094.7098388671875, + "completions/mean_terminated_length": 791.9000244140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8280858771509243, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13075497930710098, + "kl": 0.0252685546875, + "learning_rate": 1.7887637964661367e-07, + "loss": 0.0652, + "num_tokens": 2125510991.0, + "reward": 2.3348214626312256, + "reward_std": 0.44865262508392334, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.16516682505607605, + "step": 3886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 885.7388916015625, + "completions/mean_terminated_length": 702.5401000976562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8282989718182302, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14894528813990038, + "kl": 0.029815673828125, + "learning_rate": 1.786871579761881e-07, + "loss": 0.0564, + "num_tokens": 2125973210.0, + "reward": 2.4464287757873535, + "reward_std": 0.4171983301639557, + "rewards/accuracy_reward/mean": 0.5671296119689941, + "rewards/accuracy_reward/std": 0.4960475564002991, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14383503794670105, + "step": 3887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 886.3281860351562, + "completions/mean_terminated_length": 733.7853393554688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8285120664855362, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14488186379711643, + "kl": 0.030853271484375, + "learning_rate": 1.784981417989991e-07, + "loss": 0.0297, + "num_tokens": 2126438637.0, + "reward": 2.4129464626312256, + "reward_std": 0.3363885283470154, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10110349208116531, + "step": 3888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1064.8973388671875, + "completions/mean_terminated_length": 854.4227905273438, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8287251611528421, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11943081372714248, + "kl": 0.0257568359375, + "learning_rate": 1.7830933121965258e-07, + "loss": 0.0862, + "num_tokens": 2126990415.0, + "reward": 2.46484375, + "reward_std": 0.44837912917137146, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1463720053434372, + "step": 3889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1013.41748046875, + "completions/mean_terminated_length": 788.5081787109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8289382558201481, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11180027517430349, + "kl": 0.0274658203125, + "learning_rate": 1.7812072634264078e-07, + "loss": 0.0209, + "num_tokens": 2127520362.0, + "reward": 2.537388563156128, + "reward_std": 0.3675636649131775, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.10248777270317078, + "step": 3890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1070.0513916015625, + "completions/mean_terminated_length": 770.6793212890625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.829151350487454, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1328632875438324, + "kl": 0.02593994140625, + "learning_rate": 1.7793232727234193e-07, + "loss": 0.083, + "num_tokens": 2128071857.0, + "reward": 2.41015625, + "reward_std": 0.41582852602005005, + "rewards/accuracy_reward/mean": 0.5347222089767456, + "rewards/accuracy_reward/std": 0.499371200799942, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14936910569667816, + "step": 3891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1016.58935546875, + "completions/mean_terminated_length": 757.2960815429688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.82936444515476, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11711086712330981, + "kl": 0.02813720703125, + "learning_rate": 1.7774413411302058e-07, + "loss": 0.0281, + "num_tokens": 2128594937.0, + "reward": 2.2455358505249023, + "reward_std": 0.36991551518440247, + "rewards/accuracy_reward/mean": 0.3459821343421936, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12449962645769119, + "step": 3892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 953.888427734375, + "completions/mean_terminated_length": 758.1000366210938, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.829577539822066, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14365553250977833, + "kl": 0.030548095703125, + "learning_rate": 1.7755614696882726e-07, + "loss": 0.0558, + "num_tokens": 2129085751.0, + "reward": 2.459263563156128, + "reward_std": 0.45786023139953613, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13416090607643127, + "step": 3893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 924.0402221679688, + "completions/mean_terminated_length": 743.5077514648438, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.829790634489372, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1329382005511629, + "kl": 0.03009033203125, + "learning_rate": 1.7736836594379847e-07, + "loss": 0.0566, + "num_tokens": 2129570265.0, + "reward": 2.560267925262451, + "reward_std": 0.36560624837875366, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09178353101015091, + "step": 3894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1041.685302734375, + "completions/mean_terminated_length": 816.2267456054688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8300037291566779, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13223365330253428, + "kl": 0.02740478515625, + "learning_rate": 1.7718079114185662e-07, + "loss": 0.0836, + "num_tokens": 2130105852.0, + "reward": 2.4140625, + "reward_std": 0.48807382583618164, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1659623235464096, + "step": 3895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 963.3527221679688, + "completions/mean_terminated_length": 701.9556884765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8302168238239838, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11031613100030942, + "kl": 0.027923583984375, + "learning_rate": 1.7699342266681e-07, + "loss": 0.046, + "num_tokens": 2130603482.0, + "reward": 2.4732143878936768, + "reward_std": 0.359651654958725, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407235741615295, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12383606284856796, + "step": 3896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1012.5535888671875, + "completions/mean_terminated_length": 824.042236328125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8304299184912898, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12201883068951357, + "kl": 0.02777099609375, + "learning_rate": 1.7680626062235266e-07, + "loss": 0.061, + "num_tokens": 2131128850.0, + "reward": 2.4386162757873535, + "reward_std": 0.40508919954299927, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11345604062080383, + "step": 3897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 1056.07373046875, + "completions/mean_terminated_length": 846.9649047851562, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8306430131585957, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10846980145606772, + "kl": 0.024993896484375, + "learning_rate": 1.7661930511206463e-07, + "loss": 0.0484, + "num_tokens": 2131669491.0, + "reward": 2.502232313156128, + "reward_std": 0.34739798307418823, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10247711092233658, + "step": 3898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1018.8906860351562, + "completions/mean_terminated_length": 774.4061279296875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8308561078259017, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12667823279699056, + "kl": 0.02667236328125, + "learning_rate": 1.7643255623941117e-07, + "loss": 0.0472, + "num_tokens": 2132197682.0, + "reward": 2.3253350257873535, + "reward_std": 0.40768349170684814, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.1475696861743927, + "step": 3899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1061.3773193359375, + "completions/mean_terminated_length": 826.9862060546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8310692024932076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14445302679487643, + "kl": 0.024932861328125, + "learning_rate": 1.762460141077438e-07, + "loss": 0.0949, + "num_tokens": 2132741211.0, + "reward": 2.251674175262451, + "reward_std": 0.41256797313690186, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14025694131851196, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1119.9754638671875, + "completions/mean_terminated_length": 866.8778686523438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8312822971605136, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12563825568648504, + "kl": 0.0252685546875, + "learning_rate": 1.7605967882029932e-07, + "loss": 0.0819, + "num_tokens": 2133317744.0, + "reward": 2.2862725257873535, + "reward_std": 0.5163499712944031, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13598167896270752, + "step": 3901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1097.65625, + "completions/mean_terminated_length": 828.0745239257812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8314953918278195, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1269563376185568, + "kl": 0.026123046875, + "learning_rate": 1.758735504801997e-07, + "loss": 0.0667, + "num_tokens": 2133883350.0, + "reward": 2.3973214626312256, + "reward_std": 0.42701372504234314, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.1561690717935562, + "step": 3902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1003.685302734375, + "completions/mean_terminated_length": 755.5884399414062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8317084864951254, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11602401799526892, + "kl": 0.02911376953125, + "learning_rate": 1.7568762919045306e-07, + "loss": 0.0345, + "num_tokens": 2134405209.0, + "reward": 2.423549175262451, + "reward_std": 0.34538063406944275, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12493880838155746, + "step": 3903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1042.6942138671875, + "completions/mean_terminated_length": 782.8960571289062, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.8319215811624314, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12879749600557552, + "kl": 0.026885986328125, + "learning_rate": 1.755019150539524e-07, + "loss": 0.0564, + "num_tokens": 2134939952.0, + "reward": 2.388951063156128, + "reward_std": 0.3237634301185608, + "rewards/accuracy_reward/mean": 0.46296295523643494, + "rewards/accuracy_reward/std": 0.49920445680618286, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.191280335187912, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.1098259910941124, + "step": 3904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 944.3370971679688, + "completions/mean_terminated_length": 776.9434204101562, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8321346758297373, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1388903616038368, + "kl": 0.030670166015625, + "learning_rate": 1.7531640817347659e-07, + "loss": 0.1158, + "num_tokens": 2135432247.0, + "reward": 2.5262277126312256, + "reward_std": 0.4996793866157532, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14611591398715973, + "step": 3905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1064.638427734375, + "completions/mean_terminated_length": 844.3223876953125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8323477704970433, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1257078028503783, + "kl": 0.025299072265625, + "learning_rate": 1.7513110865168902e-07, + "loss": 0.0656, + "num_tokens": 2135979605.0, + "reward": 2.4140625, + "reward_std": 0.4264105260372162, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12695714831352234, + "step": 3906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 925.1942138671875, + "completions/mean_terminated_length": 703.0347900390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8325608651643492, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13144413664802096, + "kl": 0.029876708984375, + "learning_rate": 1.7494601659113915e-07, + "loss": 0.0315, + "num_tokens": 2136461692.0, + "reward": 2.4776787757873535, + "reward_std": 0.3364052176475525, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11849905550479889, + "step": 3907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 858.0535888671875, + "completions/mean_terminated_length": 670.490966796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8327739598316553, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13635299207740328, + "kl": 0.035430908203125, + "learning_rate": 1.7476113209426118e-07, + "loss": 0.047, + "num_tokens": 2136913172.0, + "reward": 2.451451063156128, + "reward_std": 0.4038492739200592, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1273927539587021, + "step": 3908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1030.8795166015625, + "completions/mean_terminated_length": 775.1787719726562, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8329870544989612, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11100953713700229, + "kl": 0.027191162109375, + "learning_rate": 1.745764552633745e-07, + "loss": 0.0602, + "num_tokens": 2137446350.0, + "reward": 2.44921875, + "reward_std": 0.34960824251174927, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.09953393787145615, + "step": 3909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 872.1563110351562, + "completions/mean_terminated_length": 714.3848266601562, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8332001491662672, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11625744194766788, + "kl": 0.0311279296875, + "learning_rate": 1.7439198620068362e-07, + "loss": 0.0208, + "num_tokens": 2137908932.0, + "reward": 2.564732313156128, + "reward_std": 0.4211812913417816, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10649171471595764, + "step": 3910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1030.837158203125, + "completions/mean_terminated_length": 823.029541015625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8334132438335731, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13610590295865335, + "kl": 0.02581787109375, + "learning_rate": 1.7420772500827795e-07, + "loss": 0.08, + "num_tokens": 2138452475.0, + "reward": 2.4207589626312256, + "reward_std": 0.4782007038593292, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1555563360452652, + "step": 3911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 1043.8035888671875, + "completions/mean_terminated_length": 801.7949829101562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.833626338500879, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11679535852442592, + "kl": 0.0257568359375, + "learning_rate": 1.740236717881322e-07, + "loss": 0.0624, + "num_tokens": 2138990803.0, + "reward": 2.33984375, + "reward_std": 0.36936238408088684, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134778022766, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10500273108482361, + "step": 3912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 840.1317138671875, + "completions/mean_terminated_length": 678.0632934570312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.833839433168185, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14948153743774778, + "kl": 0.033111572265625, + "learning_rate": 1.7383982664210556e-07, + "loss": 0.1414, + "num_tokens": 2139434814.0, + "reward": 2.536830425262451, + "reward_std": 0.4081796109676361, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.4822678565979004, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.1194591298699379, + "step": 3913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1040.1898193359375, + "completions/mean_terminated_length": 811.0164794921875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.8340525278354909, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1282646551445139, + "kl": 0.027313232421875, + "learning_rate": 1.7365618967194208e-07, + "loss": 0.0811, + "num_tokens": 2139973907.0, + "reward": 2.421875, + "reward_std": 0.460305392742157, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13376134634017944, + "step": 3914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 961.0558471679688, + "completions/mean_terminated_length": 786.4688720703125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8342656225027969, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15055271343526974, + "kl": 0.028961181640625, + "learning_rate": 1.7347276097927105e-07, + "loss": 0.0922, + "num_tokens": 2140476140.0, + "reward": 2.376674175262451, + "reward_std": 0.47188296914100647, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14989471435546875, + "step": 3915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1180.337158203125, + "completions/mean_terminated_length": 870.081787109375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8344787171701028, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.9690972641212007, + "kl": 0.04638671875, + "learning_rate": 1.732895406656062e-07, + "loss": 0.0788, + "num_tokens": 2141074611.0, + "reward": 2.318638563156128, + "reward_std": 0.47845640778541565, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.17149938642978668, + "step": 3916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 979.700927734375, + "completions/mean_terminated_length": 733.1703491210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8346918118374088, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13941158294168482, + "kl": 0.0284423828125, + "learning_rate": 1.7310652883234584e-07, + "loss": 0.0813, + "num_tokens": 2141581901.0, + "reward": 2.396205425262451, + "reward_std": 0.388677179813385, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11015089601278305, + "step": 3917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 962.8147583007812, + "completions/mean_terminated_length": 719.685791015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8349049065047147, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1373743387548222, + "kl": 0.030120849609375, + "learning_rate": 1.729237255807729e-07, + "loss": 0.0897, + "num_tokens": 2142078874.0, + "reward": 2.4246652126312256, + "reward_std": 0.4762446880340576, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.126290425658226, + "step": 3918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1038.0535888671875, + "completions/mean_terminated_length": 825.14599609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8351180011720206, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15000978176844298, + "kl": 0.025421142578125, + "learning_rate": 1.7274113101205523e-07, + "loss": 0.0299, + "num_tokens": 2142618370.0, + "reward": 2.4190850257873535, + "reward_std": 0.38210952281951904, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.09933306276798248, + "step": 3919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 925.8326416015625, + "completions/mean_terminated_length": 725.0237426757812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8353310958393266, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1339903724570275, + "kl": 0.029205322265625, + "learning_rate": 1.7255874522724494e-07, + "loss": 0.0442, + "num_tokens": 2143101927.0, + "reward": 2.5066964626312256, + "reward_std": 0.4270004332065582, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12717820703983307, + "step": 3920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 992.63623046875, + "completions/mean_terminated_length": 780.431640625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8355441905066325, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11098405181852714, + "kl": 0.02679443359375, + "learning_rate": 1.7237656832727826e-07, + "loss": 0.0739, + "num_tokens": 2143612724.0, + "reward": 2.513951063156128, + "reward_std": 0.36678698658943176, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10326440632343292, + "step": 3921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 919.18310546875, + "completions/mean_terminated_length": 747.9743041992188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8357572851739385, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14706129399313073, + "kl": 0.030242919921875, + "learning_rate": 1.7219460041297657e-07, + "loss": 0.0939, + "num_tokens": 2144090758.0, + "reward": 2.4380581378936768, + "reward_std": 0.4117111563682556, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.14050593972206116, + "step": 3922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 904.6004638671875, + "completions/mean_terminated_length": 751.1823120117188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8359703798412444, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1262382231061428, + "kl": 0.029693603515625, + "learning_rate": 1.7201284158504497e-07, + "loss": 0.0713, + "num_tokens": 2144572451.0, + "reward": 2.4073662757873535, + "reward_std": 0.40641263127326965, + "rewards/accuracy_reward/mean": 0.5300925970077515, + "rewards/accuracy_reward/std": 0.4996722638607025, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13295157253742218, + "step": 3923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1057.93310546875, + "completions/mean_terminated_length": 809.0335083007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8361834745085505, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12624698429513712, + "kl": 0.025390625, + "learning_rate": 1.7183129194407317e-07, + "loss": 0.123, + "num_tokens": 2145108485.0, + "reward": 2.384486675262451, + "reward_std": 0.40201154351234436, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791021347046, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14585080742835999, + "step": 3924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1012.5535888671875, + "completions/mean_terminated_length": 810.9866333007812, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8363965691758564, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11714841837828893, + "kl": 0.024200439453125, + "learning_rate": 1.716499515905349e-07, + "loss": 0.0245, + "num_tokens": 2145634333.0, + "reward": 2.453125, + "reward_std": 0.35915809869766235, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11771687865257263, + "step": 3925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1052.7210693359375, + "completions/mean_terminated_length": 836.3560180664062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8366096638431624, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1269775391191213, + "kl": 0.02728271484375, + "learning_rate": 1.7146882062478807e-07, + "loss": 0.077, + "num_tokens": 2146175296.0, + "reward": 2.4598214626312256, + "reward_std": 0.40136730670928955, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10517053306102753, + "step": 3926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1059.265625, + "completions/mean_terminated_length": 831.09619140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8368227585104683, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13080246427676054, + "kl": 0.026092529296875, + "learning_rate": 1.7128789914707504e-07, + "loss": 0.0467, + "num_tokens": 2146721095.0, + "reward": 2.3604912757873535, + "reward_std": 0.43019357323646545, + "rewards/accuracy_reward/mean": 0.46990740299224854, + "rewards/accuracy_reward/std": 0.4996722638607025, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12177752703428268, + "step": 3927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1110.8125, + "completions/mean_terminated_length": 771.8297729492188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8370358531777742, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11130727770213093, + "kl": 0.024932861328125, + "learning_rate": 1.7110718725752188e-07, + "loss": 0.0549, + "num_tokens": 2147287747.0, + "reward": 2.380580425262451, + "reward_std": 0.37004560232162476, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13399910926818848, + "step": 3928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1059.35498046875, + "completions/mean_terminated_length": 807.3473510742188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8372489478450802, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14371740086122145, + "kl": 0.02691650390625, + "learning_rate": 1.7092668505613883e-07, + "loss": 0.099, + "num_tokens": 2147837106.0, + "reward": 2.364955425262451, + "reward_std": 0.49356189370155334, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14114972949028015, + "step": 3929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 993.9777221679688, + "completions/mean_terminated_length": 771.7783813476562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8374620425123861, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13519628448879453, + "kl": 0.028076171875, + "learning_rate": 1.7074639264281998e-07, + "loss": 0.0532, + "num_tokens": 2148357624.0, + "reward": 2.3560268878936768, + "reward_std": 0.38538238406181335, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09605696052312851, + "step": 3930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1015.138427734375, + "completions/mean_terminated_length": 833.5065307617188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8376751371796921, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12088348123742187, + "kl": 0.02813720703125, + "learning_rate": 1.705663101173434e-07, + "loss": 0.0501, + "num_tokens": 2148880438.0, + "reward": 2.5223214626312256, + "reward_std": 0.4124482572078705, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14903545379638672, + "step": 3931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1020.04248046875, + "completions/mean_terminated_length": 793.1634521484375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.837888231846998, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.6012469312318556, + "kl": 0.031951904296875, + "learning_rate": 1.7038643757937106e-07, + "loss": 0.0768, + "num_tokens": 2149414249.0, + "reward": 2.459263563156128, + "reward_std": 0.4629363417625427, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16142748296260834, + "step": 3932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1141.0982666015625, + "completions/mean_terminated_length": 849.49853515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.838101326514304, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12076342193411635, + "kl": 0.02386474609375, + "learning_rate": 1.7020677512844843e-07, + "loss": 0.0769, + "num_tokens": 2150004821.0, + "reward": 2.3080358505249023, + "reward_std": 0.46129682660102844, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16740410029888153, + "step": 3933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1060.01123046875, + "completions/mean_terminated_length": 818.5028076171875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8383144211816099, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 1.2271355853532218, + "kl": 0.029296875, + "learning_rate": 1.7002732286400523e-07, + "loss": 0.0694, + "num_tokens": 2150554970.0, + "reward": 2.341517925262451, + "reward_std": 0.4885002374649048, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15456202626228333, + "step": 3934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1033.759033203125, + "completions/mean_terminated_length": 842.7479858398438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8385275158489158, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10985055987365569, + "kl": 0.025726318359375, + "learning_rate": 1.6984808088535436e-07, + "loss": 0.0465, + "num_tokens": 2151088478.0, + "reward": 2.4732143878936768, + "reward_std": 0.36709195375442505, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.1042405441403389, + "step": 3935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1047.24560546875, + "completions/mean_terminated_length": 846.021484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8387406105162218, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12345973871857009, + "kl": 0.02716064453125, + "learning_rate": 1.6966904929169258e-07, + "loss": 0.1071, + "num_tokens": 2151627180.0, + "reward": 2.4933037757873535, + "reward_std": 0.45501869916915894, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13271193206310272, + "step": 3936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1094.609375, + "completions/mean_terminated_length": 806.375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8389537051835277, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1304823868169798, + "kl": 0.026519775390625, + "learning_rate": 1.6949022818210012e-07, + "loss": 0.0603, + "num_tokens": 2152197485.0, + "reward": 2.34375, + "reward_std": 0.43222954869270325, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14285963773727417, + "step": 3937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 977.43310546875, + "completions/mean_terminated_length": 741.1498413085938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8391667998508338, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13443368921811552, + "kl": 0.02630615234375, + "learning_rate": 1.6931161765554076e-07, + "loss": 0.087, + "num_tokens": 2152699743.0, + "reward": 2.3627233505249023, + "reward_std": 0.48113736510276794, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15520282089710236, + "step": 3938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1061.430908203125, + "completions/mean_terminated_length": 827.052490234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8393798945181397, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12110238324440441, + "kl": 0.027862548828125, + "learning_rate": 1.6913321781086202e-07, + "loss": 0.0699, + "num_tokens": 2153247088.0, + "reward": 2.369419813156128, + "reward_std": 0.41558340191841125, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14409084618091583, + "step": 3939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1041.044677734375, + "completions/mean_terminated_length": 773.6610107421875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8395929891854457, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11303236677351514, + "kl": 0.026824951171875, + "learning_rate": 1.6895502874679413e-07, + "loss": 0.0191, + "num_tokens": 2153784388.0, + "reward": 2.4799108505249023, + "reward_std": 0.3943043649196625, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13888955116271973, + "step": 3940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1026.0335693359375, + "completions/mean_terminated_length": 861.8834228515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8398060838527516, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11558990207418297, + "kl": 0.027130126953125, + "learning_rate": 1.6877705056195146e-07, + "loss": 0.0434, + "num_tokens": 2154315283.0, + "reward": 2.498326063156128, + "reward_std": 0.4462723135948181, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13440261781215668, + "step": 3941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1085.747802734375, + "completions/mean_terminated_length": 833.6647338867188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8400191785200576, + "frac_reward_zero_std": 0.3214285969734192, + "grad_norm": 0.11042895644084216, + "kl": 0.022735595703125, + "learning_rate": 1.6859928335483114e-07, + "loss": 0.0695, + "num_tokens": 2154875634.0, + "reward": 2.4112725257873535, + "reward_std": 0.3485161364078522, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.13213567435741425, + "step": 3942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1058.0535888671875, + "completions/mean_terminated_length": 871.6180419921875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8402322731873635, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1286800157820968, + "kl": 0.025238037109375, + "learning_rate": 1.6842172722381375e-07, + "loss": 0.0796, + "num_tokens": 2155421898.0, + "reward": 2.474888563156128, + "reward_std": 0.4429524838924408, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.150824636220932, + "step": 3943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 970.8951416015625, + "completions/mean_terminated_length": 781.48291015625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.8404453678546694, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14490422154992252, + "kl": 0.0281982421875, + "learning_rate": 1.6824438226716305e-07, + "loss": 0.1044, + "num_tokens": 2155918459.0, + "reward": 2.4642858505249023, + "reward_std": 0.42229321599006653, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14530304074287415, + "step": 3944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1041.5670166015625, + "completions/mean_terminated_length": 855.1904296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8406584625219754, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12608539359190912, + "kl": 0.029754638671875, + "learning_rate": 1.6806724858302574e-07, + "loss": 0.0596, + "num_tokens": 2156453385.0, + "reward": 2.4921875, + "reward_std": 0.4592643678188324, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1319335699081421, + "step": 3945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1111.3482666015625, + "completions/mean_terminated_length": 835.2254028320312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8408715571892813, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11003676468301371, + "kl": 0.023895263671875, + "learning_rate": 1.6789032626943194e-07, + "loss": 0.0975, + "num_tokens": 2157024213.0, + "reward": 2.396205425262451, + "reward_std": 0.3622308075428009, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11389531940221786, + "step": 3946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1056.4085693359375, + "completions/mean_terminated_length": 803.64990234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8410846518565873, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12024458695282089, + "kl": 0.025421142578125, + "learning_rate": 1.6771361542429463e-07, + "loss": 0.0192, + "num_tokens": 2157572252.0, + "reward": 2.385044813156128, + "reward_std": 0.3676413297653198, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.0999298095703125, + "step": 3947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1025.8192138671875, + "completions/mean_terminated_length": 747.0426635742188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8412977465238932, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12444682506885027, + "kl": 0.02740478515625, + "learning_rate": 1.6753711614540961e-07, + "loss": 0.0839, + "num_tokens": 2158101803.0, + "reward": 2.4838171005249023, + "reward_std": 0.44563043117523193, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 3948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1002.6875610351562, + "completions/mean_terminated_length": 778.8943481445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8415108411911992, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1477206465752429, + "kl": 0.02728271484375, + "learning_rate": 1.6736082853045576e-07, + "loss": 0.1316, + "num_tokens": 2158620015.0, + "reward": 2.4715402126312256, + "reward_std": 0.4965323507785797, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1546175479888916, + "step": 3949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 936.66748046875, + "completions/mean_terminated_length": 784.352783203125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8417239358585051, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13059005084397984, + "kl": 0.029632568359375, + "learning_rate": 1.671847526769948e-07, + "loss": 0.0349, + "num_tokens": 2159102826.0, + "reward": 2.486607313156128, + "reward_std": 0.45923885703086853, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13940991461277008, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 992.4910888671875, + "completions/mean_terminated_length": 730.8189086914062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.841937030525811, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12847389270482107, + "kl": 0.028472900390625, + "learning_rate": 1.670088886824712e-07, + "loss": 0.1107, + "num_tokens": 2159612214.0, + "reward": 2.4620537757873535, + "reward_std": 0.44468992948532104, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15850186347961426, + "step": 3951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1041.493408203125, + "completions/mean_terminated_length": 835.8629150390625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.842150125193117, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12682891652199707, + "kl": 0.0263671875, + "learning_rate": 1.6683323664421218e-07, + "loss": 0.0748, + "num_tokens": 2160149699.0, + "reward": 2.3058037757873535, + "reward_std": 0.45571643114089966, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.494759202003479, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.14658613502979279, + "step": 3952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1037.1763916015625, + "completions/mean_terminated_length": 768.7655639648438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.842363219860423, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14278411919562792, + "kl": 0.026275634765625, + "learning_rate": 1.666577966594278e-07, + "loss": 0.0742, + "num_tokens": 2160684610.0, + "reward": 2.2650671005249023, + "reward_std": 0.44000667333602905, + "rewards/accuracy_reward/mean": 0.40509259700775146, + "rewards/accuracy_reward/std": 0.49147912859916687, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15792487561702728, + "step": 3953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 977.9777221679688, + "completions/mean_terminated_length": 793.104736328125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.842576314527729, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15017519648572, + "kl": 0.027130126953125, + "learning_rate": 1.6648256882521078e-07, + "loss": 0.1066, + "num_tokens": 2161190568.0, + "reward": 2.424107313156128, + "reward_std": 0.46449989080429077, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.17606906592845917, + "step": 3954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 934.1160888671875, + "completions/mean_terminated_length": 741.6649169921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8427894091950349, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12163427570938457, + "kl": 0.027862548828125, + "learning_rate": 1.6630755323853597e-07, + "loss": 0.0742, + "num_tokens": 2161675580.0, + "reward": 2.537388563156128, + "reward_std": 0.4003835618495941, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835427761078, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15880775451660156, + "step": 3955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 889.1607666015625, + "completions/mean_terminated_length": 713.3984375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.8430025038623409, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15176354224383629, + "kl": 0.03155517578125, + "learning_rate": 1.6613274999626134e-07, + "loss": 0.1254, + "num_tokens": 2162141652.0, + "reward": 2.5184152126312256, + "reward_std": 0.4287479519844055, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13335825502872467, + "step": 3956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1153.19873046875, + "completions/mean_terminated_length": 937.5540161132812, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.8432155985296468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11292106488318103, + "kl": 0.024505615234375, + "learning_rate": 1.65958159195127e-07, + "loss": 0.0422, + "num_tokens": 2162728445.0, + "reward": 2.36328125, + "reward_std": 0.4955526888370514, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14843007922172546, + "step": 3957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 971.6428833007812, + "completions/mean_terminated_length": 824.121826171875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8434286931969528, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14502986764941458, + "kl": 0.03533935546875, + "learning_rate": 1.6578378093175582e-07, + "loss": 0.084, + "num_tokens": 2163234909.0, + "reward": 2.3800225257873535, + "reward_std": 0.5102487206459045, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1810806840658188, + "step": 3958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1077.015625, + "completions/mean_terminated_length": 812.2017211914062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8436417878642587, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11107100531797799, + "kl": 0.025238037109375, + "learning_rate": 1.6560961530265243e-07, + "loss": 0.0661, + "num_tokens": 2163795876.0, + "reward": 2.33203125, + "reward_std": 0.35405829548835754, + "rewards/accuracy_reward/mean": 0.4040178656578064, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.11001905798912048, + "step": 3959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1037.85498046875, + "completions/mean_terminated_length": 758.697998046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8438548825315646, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14957969269666366, + "kl": 0.0294189453125, + "learning_rate": 1.6543566240420456e-07, + "loss": 0.0955, + "num_tokens": 2164333875.0, + "reward": 2.322544813156128, + "reward_std": 0.4018864929676056, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14311718940734863, + "step": 3960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 903.1339721679688, + "completions/mean_terminated_length": 756.0604248046875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.8440679771988706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1465855151067479, + "kl": 0.029327392578125, + "learning_rate": 1.652619223326816e-07, + "loss": 0.0587, + "num_tokens": 2164812671.0, + "reward": 2.5675225257873535, + "reward_std": 0.4471858739852905, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764313220977783, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.1119314432144165, + "step": 3961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 963.76123046875, + "completions/mean_terminated_length": 796.0953369140625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8442810718661765, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13034256464843505, + "kl": 0.0286865234375, + "learning_rate": 1.6508839518423547e-07, + "loss": 0.0512, + "num_tokens": 2165314804.0, + "reward": 2.498326063156128, + "reward_std": 0.3727579712867737, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11067523062229156, + "step": 3962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1073.700927734375, + "completions/mean_terminated_length": 832.1615600585938, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8444941665334825, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13164835590084706, + "kl": 0.0250244140625, + "learning_rate": 1.649150810549001e-07, + "loss": 0.0731, + "num_tokens": 2165862606.0, + "reward": 2.3504464626312256, + "reward_std": 0.4473680555820465, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14088857173919678, + "step": 3963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1078.216552734375, + "completions/mean_terminated_length": 799.5430908203125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.8447072612007884, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12167559614533628, + "kl": 0.02386474609375, + "learning_rate": 1.6474198004059153e-07, + "loss": 0.055, + "num_tokens": 2166418079.0, + "reward": 2.3621652126312256, + "reward_std": 0.3951001465320587, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.13750630617141724, + "step": 3964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 984.2656860351562, + "completions/mean_terminated_length": 790.604248046875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8449203558680944, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14158311362860126, + "kl": 0.028594970703125, + "learning_rate": 1.645690922371083e-07, + "loss": 0.0742, + "num_tokens": 2166930758.0, + "reward": 2.404017925262451, + "reward_std": 0.3562011122703552, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13165414333343506, + "step": 3965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1012.966552734375, + "completions/mean_terminated_length": 818.039794921875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.8451334505354003, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15481955372692918, + "kl": 0.029296875, + "learning_rate": 1.6439641774013013e-07, + "loss": 0.0618, + "num_tokens": 2167455095.0, + "reward": 2.4291296005249023, + "reward_std": 0.4108121693134308, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12956927716732025, + "step": 3966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 989.8995971679688, + "completions/mean_terminated_length": 780.5427856445312, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8453465452027064, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12174940001540113, + "kl": 0.027099609375, + "learning_rate": 1.642239566452192e-07, + "loss": 0.0512, + "num_tokens": 2167971322.0, + "reward": 2.4698662757873535, + "reward_std": 0.4359026849269867, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.10969661176204681, + "step": 3967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1051.71875, + "completions/mean_terminated_length": 825.1671752929688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8455596398700123, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12803480056938524, + "kl": 0.0255126953125, + "learning_rate": 1.6405170904781977e-07, + "loss": 0.0481, + "num_tokens": 2168507404.0, + "reward": 2.4799108505249023, + "reward_std": 0.4365629255771637, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1373893767595291, + "step": 3968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1003.6920166015625, + "completions/mean_terminated_length": 807.0185546875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8457727345373182, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1087610009346226, + "kl": 0.02685546875, + "learning_rate": 1.638796750432575e-07, + "loss": 0.0582, + "num_tokens": 2169026082.0, + "reward": 2.3482143878936768, + "reward_std": 0.37354791164398193, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9866071343421936, + "rewards/tag_count_reward/std": 0.08746546506881714, + "step": 3969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1151.779052734375, + "completions/mean_terminated_length": 838.6415405273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8459858292046242, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13314493535206487, + "kl": 0.023834228515625, + "learning_rate": 1.6370785472674016e-07, + "loss": 0.109, + "num_tokens": 2169618431.0, + "reward": 2.236607313156128, + "reward_std": 0.47649946808815, + "rewards/accuracy_reward/mean": 0.3504464328289032, + "rewards/accuracy_reward/std": 0.47764310240745544, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.1657252162694931, + "step": 3970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1014.5781860351562, + "completions/mean_terminated_length": 786.4931640625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8461989238719301, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1314455788394422, + "kl": 0.0267333984375, + "learning_rate": 1.6353624819335694e-07, + "loss": 0.072, + "num_tokens": 2170141506.0, + "reward": 2.376674175262451, + "reward_std": 0.47302737832069397, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1552460640668869, + "step": 3971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 994.91748046875, + "completions/mean_terminated_length": 741.1273803710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8464120185392361, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13290918244610742, + "kl": 0.0291748046875, + "learning_rate": 1.6336485553807917e-07, + "loss": 0.0585, + "num_tokens": 2170656173.0, + "reward": 2.3677456378936768, + "reward_std": 0.4094289541244507, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15122967958450317, + "step": 3972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1031.325927734375, + "completions/mean_terminated_length": 813.6640014648438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.846625113206542, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1298284687566422, + "kl": 0.02508544921875, + "learning_rate": 1.6319367685575957e-07, + "loss": 0.0846, + "num_tokens": 2171193391.0, + "reward": 2.3705358505249023, + "reward_std": 0.454289048910141, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12607400119304657, + "step": 3973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 918.7522583007812, + "completions/mean_terminated_length": 716.6763305664062, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.846838207873848, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12502940832595105, + "kl": 0.02972412109375, + "learning_rate": 1.6302271224113213e-07, + "loss": 0.0745, + "num_tokens": 2171674480.0, + "reward": 2.5167412757873535, + "reward_std": 0.3748437762260437, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12016765028238297, + "step": 3974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 898.825927734375, + "completions/mean_terminated_length": 717.68994140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8470513025411539, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1288293885664111, + "kl": 0.0308837890625, + "learning_rate": 1.6285196178881296e-07, + "loss": 0.0332, + "num_tokens": 2172141154.0, + "reward": 2.536830425262451, + "reward_std": 0.3844040632247925, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.13170628249645233, + "step": 3975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1036.421875, + "completions/mean_terminated_length": 823.1702880859375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8472643972084598, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1258931991499077, + "kl": 0.0252685546875, + "learning_rate": 1.6268142559329934e-07, + "loss": 0.0943, + "num_tokens": 2172682111.0, + "reward": 2.4720983505249023, + "reward_std": 0.40248000621795654, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.10269007831811905, + "step": 3976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1012.997802734375, + "completions/mean_terminated_length": 843.6337280273438, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8474774918757658, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12520259227471073, + "kl": 0.02728271484375, + "learning_rate": 1.6251110374896993e-07, + "loss": 0.0749, + "num_tokens": 2173215390.0, + "reward": 2.5574777126312256, + "reward_std": 0.44498884677886963, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14611589908599854, + "step": 3977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1071.118408203125, + "completions/mean_terminated_length": 835.6925048828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8476905865430717, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13233741484767567, + "kl": 0.02618408203125, + "learning_rate": 1.623409963500848e-07, + "loss": 0.0938, + "num_tokens": 2173757235.0, + "reward": 2.390625, + "reward_std": 0.444413423538208, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.16025643050670624, + "step": 3978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 949.2344360351562, + "completions/mean_terminated_length": 752.6132202148438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.8479036812103777, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1385214731696971, + "kl": 0.028289794921875, + "learning_rate": 1.6217110349078535e-07, + "loss": 0.0657, + "num_tokens": 2174268956.0, + "reward": 2.388951063156128, + "reward_std": 0.46755221486091614, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14494065940380096, + "step": 3979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1034.462158203125, + "completions/mean_terminated_length": 797.1322631835938, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.8481167758776836, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12131822339574642, + "kl": 0.025238037109375, + "learning_rate": 1.6200142526509445e-07, + "loss": 0.0844, + "num_tokens": 2174805307.0, + "reward": 2.4112725257873535, + "reward_std": 0.43319904804229736, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14151519536972046, + "step": 3980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 998.62060546875, + "completions/mean_terminated_length": 833.2144775390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8483298705449897, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12282210959777312, + "kl": 0.029052734375, + "learning_rate": 1.6183196176691595e-07, + "loss": 0.0439, + "num_tokens": 2175321521.0, + "reward": 2.4190850257873535, + "reward_std": 0.39597976207733154, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11540208011865616, + "step": 3981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1024.310302734375, + "completions/mean_terminated_length": 781.11328125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8485429652122956, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1482734076998015, + "kl": 0.028717041015625, + "learning_rate": 1.616627130900348e-07, + "loss": 0.1002, + "num_tokens": 2175852620.0, + "reward": 2.39453125, + "reward_std": 0.5537673830986023, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9503348469734192, + "rewards/tag_count_reward/std": 0.1743151992559433, + "step": 3982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1001.6652221679688, + "completions/mean_terminated_length": 774.2011108398438, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8487560598796016, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12847984390054223, + "kl": 0.026519775390625, + "learning_rate": 1.6149367932811725e-07, + "loss": 0.0474, + "num_tokens": 2176366806.0, + "reward": 2.3956475257873535, + "reward_std": 0.409063458442688, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1310732513666153, + "step": 3983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1042.4285888671875, + "completions/mean_terminated_length": 813.764404296875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8489691545469075, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1126761313172664, + "kl": 0.02685546875, + "learning_rate": 1.613248605747107e-07, + "loss": 0.071, + "num_tokens": 2176901846.0, + "reward": 2.3900671005249023, + "reward_std": 0.4422467350959778, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15614411234855652, + "step": 3984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1043.0960693359375, + "completions/mean_terminated_length": 821.30517578125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8491822492142134, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1269478890579014, + "kl": 0.027801513671875, + "learning_rate": 1.611562569232432e-07, + "loss": 0.0491, + "num_tokens": 2177444225.0, + "reward": 2.3900671005249023, + "reward_std": 0.5063492655754089, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16087746620178223, + "step": 3985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 989.0223388671875, + "completions/mean_terminated_length": 748.2137451171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8493953438815194, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13698895792540383, + "kl": 0.027130126953125, + "learning_rate": 1.6098786846702393e-07, + "loss": 0.0888, + "num_tokens": 2177966763.0, + "reward": 2.4090402126312256, + "reward_std": 0.3535729944705963, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11801211535930634, + "step": 3986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1001.3795166015625, + "completions/mean_terminated_length": 784.1563110351562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8496084385488253, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.2423596646678693, + "kl": 0.02880859375, + "learning_rate": 1.6081969529924325e-07, + "loss": 0.0538, + "num_tokens": 2178488469.0, + "reward": 2.447544813156128, + "reward_std": 0.4372623562812805, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.1088741272687912, + "step": 3987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1128.3660888671875, + "completions/mean_terminated_length": 864.1034545898438, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8498215332161313, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13649500116038935, + "kl": 0.02642822265625, + "learning_rate": 1.6065173751297202e-07, + "loss": 0.1004, + "num_tokens": 2179063609.0, + "reward": 2.3359375, + "reward_std": 0.5194458365440369, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16402535140514374, + "step": 3988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1166.9085693359375, + "completions/mean_terminated_length": 866.1766967773438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8500346278834372, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10804173824131713, + "kl": 0.023468017578125, + "learning_rate": 1.60483995201162e-07, + "loss": 0.0284, + "num_tokens": 2179664832.0, + "reward": 2.2622768878936768, + "reward_std": 0.4234698414802551, + "rewards/accuracy_reward/mean": 0.3794642984867096, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.14876297116279602, + "step": 3989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 968.19873046875, + "completions/mean_terminated_length": 737.0216674804688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8502477225507432, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11653191083215435, + "kl": 0.026947021484375, + "learning_rate": 1.6031646845664567e-07, + "loss": 0.008, + "num_tokens": 2180167145.0, + "reward": 2.4776787757873535, + "reward_std": 0.37293896079063416, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12935835123062134, + "step": 3990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1053.680908203125, + "completions/mean_terminated_length": 834.2261352539062, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8504608172180491, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10996306012586778, + "kl": 0.027618408203125, + "learning_rate": 1.601491573721363e-07, + "loss": 0.0491, + "num_tokens": 2180703178.0, + "reward": 2.421875, + "reward_std": 0.4754781723022461, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.1700088381767273, + "step": 3991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 971.8839721679688, + "completions/mean_terminated_length": 802.2636108398438, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.850673911885355, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13696045285343755, + "kl": 0.028961181640625, + "learning_rate": 1.5998206204022792e-07, + "loss": 0.0593, + "num_tokens": 2181209350.0, + "reward": 2.439174175262451, + "reward_std": 0.45443442463874817, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1516006886959076, + "step": 3992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 978.77685546875, + "completions/mean_terminated_length": 829.1399536132812, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.850887006552661, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11563832390311514, + "kl": 0.02716064453125, + "learning_rate": 1.5981518255339469e-07, + "loss": 0.0833, + "num_tokens": 2181717890.0, + "reward": 2.4933037757873535, + "reward_std": 0.39904528856277466, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12425874918699265, + "step": 3993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 1083.84375, + "completions/mean_terminated_length": 854.7901000976562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8511001012199669, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14586532128799606, + "kl": 0.0250244140625, + "learning_rate": 1.596485190039919e-07, + "loss": 0.1078, + "num_tokens": 2182269388.0, + "reward": 2.361049175262451, + "reward_std": 0.5348119139671326, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9436383843421936, + "rewards/tag_count_reward/std": 0.19221055507659912, + "step": 3994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1034.0826416015625, + "completions/mean_terminated_length": 810.3024291992188, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.8513131958872729, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13827242714387283, + "kl": 0.025970458984375, + "learning_rate": 1.5948207148425503e-07, + "loss": 0.1132, + "num_tokens": 2182804017.0, + "reward": 2.4146206378936768, + "reward_std": 0.46741876006126404, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13776934146881104, + "step": 3995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1035.696533203125, + "completions/mean_terminated_length": 815.6304321289062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8515262905545788, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12302089425598965, + "kl": 0.02520751953125, + "learning_rate": 1.5931584008629998e-07, + "loss": 0.027, + "num_tokens": 2183336617.0, + "reward": 2.4765625, + "reward_std": 0.41702374815940857, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09310024231672287, + "step": 3996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1064.1898193359375, + "completions/mean_terminated_length": 816.8630981445312, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.8517393852218849, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12656745337052655, + "kl": 0.026397705078125, + "learning_rate": 1.591498249021231e-07, + "loss": 0.0332, + "num_tokens": 2183880894.0, + "reward": 2.5083706378936768, + "reward_std": 0.39086827635765076, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10833819210529327, + "step": 3997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1139.8616943359375, + "completions/mean_terminated_length": 847.8643188476562, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8519524798891908, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12295284507759542, + "kl": 0.02325439453125, + "learning_rate": 1.5898402602360102e-07, + "loss": 0.1209, + "num_tokens": 2184465648.0, + "reward": 2.373326063156128, + "reward_std": 0.474073588848114, + "rewards/accuracy_reward/mean": 0.5092592835426331, + "rewards/accuracy_reward/std": 0.5004938840866089, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16232210397720337, + "step": 3998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1128.9754638671875, + "completions/mean_terminated_length": 888.2168579101562, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8521655745564968, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12440389870639547, + "kl": 0.0252685546875, + "learning_rate": 1.588184435424909e-07, + "loss": 0.088, + "num_tokens": 2185045733.0, + "reward": 2.3448662757873535, + "reward_std": 0.4449191391468048, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1576608121395111, + "step": 3999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 985.1027221679688, + "completions/mean_terminated_length": 781.5691528320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8523786692238027, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12281072038839168, + "kl": 0.028350830078125, + "learning_rate": 1.5865307755042988e-07, + "loss": 0.0984, + "num_tokens": 2185557763.0, + "reward": 2.513951063156128, + "reward_std": 0.4216967523097992, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13646738231182098, + "step": 4000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 960.8839721679688, + "completions/mean_terminated_length": 802.4041137695312, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.8525917638911086, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14072339744804316, + "kl": 0.0291748046875, + "learning_rate": 1.584879281389354e-07, + "loss": 0.0692, + "num_tokens": 2186057263.0, + "reward": 2.4408483505249023, + "reward_std": 0.4403288960456848, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15520283579826355, + "step": 4001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 981.5670166015625, + "completions/mean_terminated_length": 749.7337036132812, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8528048585584146, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11975955323051019, + "kl": 0.027374267578125, + "learning_rate": 1.5832299539940503e-07, + "loss": 0.0624, + "num_tokens": 2186566509.0, + "reward": 2.4564733505249023, + "reward_std": 0.41955676674842834, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13246241211891174, + "step": 4002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 995.9464721679688, + "completions/mean_terminated_length": 823.7921752929688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8530179532257205, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1453832982563269, + "kl": 0.027618408203125, + "learning_rate": 1.5815827942311634e-07, + "loss": 0.0918, + "num_tokens": 2187078837.0, + "reward": 2.404017925262451, + "reward_std": 0.5145753622055054, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16402915120124817, + "step": 4003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 922.7567138671875, + "completions/mean_terminated_length": 742.0181274414062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8532310478930265, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14153947804040695, + "kl": 0.02789306640625, + "learning_rate": 1.5799378030122707e-07, + "loss": 0.0641, + "num_tokens": 2187559912.0, + "reward": 2.4693081378936768, + "reward_std": 0.3319628834724426, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.11234336346387863, + "step": 4004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 950.4129638671875, + "completions/mean_terminated_length": 790.4066162109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8534441425603324, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11939264631217629, + "kl": 0.028778076171875, + "learning_rate": 1.578294981247748e-07, + "loss": 0.0253, + "num_tokens": 2188054481.0, + "reward": 2.4732143878936768, + "reward_std": 0.3730962872505188, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12083587795495987, + "step": 4005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1061.82373046875, + "completions/mean_terminated_length": 796.4220581054688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8536572372276384, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10839936425407633, + "kl": 0.024200439453125, + "learning_rate": 1.5766543298467732e-07, + "loss": 0.0783, + "num_tokens": 2188595090.0, + "reward": 2.4441964626312256, + "reward_std": 0.4165492653846741, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11771687865257263, + "step": 4006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1232.796875, + "completions/mean_terminated_length": 957.8179321289062, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8538703318949443, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10946990379207705, + "kl": 0.0228271484375, + "learning_rate": 1.575015849717321e-07, + "loss": 0.0377, + "num_tokens": 2189216151.0, + "reward": 2.2901787757873535, + "reward_std": 0.4693734049797058, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 4007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 987.2344360351562, + "completions/mean_terminated_length": 797.4132080078125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8540834265622502, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.4228329953421936, + "kl": 0.02996826171875, + "learning_rate": 1.5733795417661624e-07, + "loss": 0.0752, + "num_tokens": 2189734496.0, + "reward": 2.446986675262451, + "reward_std": 0.3441595435142517, + "rewards/accuracy_reward/mean": 0.5578703880310059, + "rewards/accuracy_reward/std": 0.49721553921699524, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1199323758482933, + "step": 4008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1041.602783203125, + "completions/mean_terminated_length": 832.7277221679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8542965212295562, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14216195771208615, + "kl": 0.027801513671875, + "learning_rate": 1.5717454068988716e-07, + "loss": 0.121, + "num_tokens": 2190267630.0, + "reward": 2.4464287757873535, + "reward_std": 0.5481163859367371, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134778022766, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.16626667976379395, + "step": 4009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 999.0000610351562, + "completions/mean_terminated_length": 788.0750732421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8545096158968621, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13551341729820773, + "kl": 0.0283203125, + "learning_rate": 1.5701134460198145e-07, + "loss": 0.0712, + "num_tokens": 2190789582.0, + "reward": 2.423549175262451, + "reward_std": 0.4042442739009857, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.1330958753824234, + "step": 4010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1047.200927734375, + "completions/mean_terminated_length": 839.4878540039062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8547227105641682, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14192018750225546, + "kl": 0.026397705078125, + "learning_rate": 1.5684836600321595e-07, + "loss": 0.1099, + "num_tokens": 2191327576.0, + "reward": 2.3895089626312256, + "reward_std": 0.46169066429138184, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.14629201591014862, + "step": 4011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1074.7054443359375, + "completions/mean_terminated_length": 833.4150390625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.854935805231474, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14206528118573122, + "kl": 0.02886962890625, + "learning_rate": 1.5668560498378652e-07, + "loss": 0.0556, + "num_tokens": 2191879956.0, + "reward": 2.275669813156128, + "reward_std": 0.3963678479194641, + "rewards/accuracy_reward/mean": 0.3883928656578064, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1464626044034958, + "step": 4012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 985.8995971679688, + "completions/mean_terminated_length": 755.0081787109375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8551488998987801, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1391856678359491, + "kl": 0.028564453125, + "learning_rate": 1.5652306163376918e-07, + "loss": 0.053, + "num_tokens": 2192390631.0, + "reward": 2.4441964626312256, + "reward_std": 0.4495397210121155, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13636787235736847, + "step": 4013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 923.2701416015625, + "completions/mean_terminated_length": 785.1453857421875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.855361994566086, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13136577836924435, + "kl": 0.03045654296875, + "learning_rate": 1.5636073604311911e-07, + "loss": 0.0627, + "num_tokens": 2192871920.0, + "reward": 2.4659600257873535, + "reward_std": 0.39481469988822937, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10073082149028778, + "step": 4014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1071.0670166015625, + "completions/mean_terminated_length": 808.1529541015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.855575089233392, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.127695395459119, + "kl": 0.026336669921875, + "learning_rate": 1.5619862830167116e-07, + "loss": 0.1324, + "num_tokens": 2193428942.0, + "reward": 2.3543527126312256, + "reward_std": 0.4791010320186615, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9458705186843872, + "rewards/tag_count_reward/std": 0.17854657769203186, + "step": 4015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1148.6138916015625, + "completions/mean_terminated_length": 883.4768676757812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.8557881839006979, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.33400718996469586, + "kl": 0.027435302734375, + "learning_rate": 1.5603673849913945e-07, + "loss": 0.0968, + "num_tokens": 2194019073.0, + "reward": 2.28125, + "reward_std": 0.4129432439804077, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.16370916366577148, + "step": 4016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 977.9152221679688, + "completions/mean_terminated_length": 779.7512817382812, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.8560012785680038, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13713082830861867, + "kl": 0.0303955078125, + "learning_rate": 1.5587506672511763e-07, + "loss": 0.0679, + "num_tokens": 2194528011.0, + "reward": 2.490513563156128, + "reward_std": 0.448030561208725, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12605296075344086, + "step": 4017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1121.134033203125, + "completions/mean_terminated_length": 854.7930908203125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8562143732353098, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1299457331066092, + "kl": 0.024169921875, + "learning_rate": 1.5571361306907883e-07, + "loss": 0.0621, + "num_tokens": 2195104647.0, + "reward": 2.3002233505249023, + "reward_std": 0.42757025361061096, + "rewards/accuracy_reward/mean": 0.39814814925193787, + "rewards/accuracy_reward/std": 0.49008384346961975, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12177752703428268, + "step": 4018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 954.7410888671875, + "completions/mean_terminated_length": 772.53125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8564274679026157, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13356461116682083, + "kl": 0.030059814453125, + "learning_rate": 1.555523776203751e-07, + "loss": 0.0637, + "num_tokens": 2195598595.0, + "reward": 2.474888563156128, + "reward_std": 0.4388851523399353, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13416090607643127, + "step": 4019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1065.2210693359375, + "completions/mean_terminated_length": 797.1903686523438, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8566405625699217, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11481451753985222, + "kl": 0.02593994140625, + "learning_rate": 1.5539136046823783e-07, + "loss": 0.0956, + "num_tokens": 2196149654.0, + "reward": 2.365513563156128, + "reward_std": 0.47923123836517334, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.18044531345367432, + "step": 4020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1098.0045166015625, + "completions/mean_terminated_length": 881.9780883789062, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.8568536572372276, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12223149494792568, + "kl": 0.0257568359375, + "learning_rate": 1.5523056170177796e-07, + "loss": 0.0706, + "num_tokens": 2196714216.0, + "reward": 2.4458706378936768, + "reward_std": 0.40560591220855713, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12315750867128372, + "step": 4021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 981.8281860351562, + "completions/mean_terminated_length": 739.3836059570312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8570667519045336, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12752028982107383, + "kl": 0.02850341796875, + "learning_rate": 1.5506998140998516e-07, + "loss": 0.0994, + "num_tokens": 2197222251.0, + "reward": 2.41796875, + "reward_std": 0.37595540285110474, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12151455134153366, + "step": 4022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 945.5313110351562, + "completions/mean_terminated_length": 758.42822265625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8572798465718395, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.13000654982989374, + "kl": 0.026336669921875, + "learning_rate": 1.5490961968172827e-07, + "loss": 0.0437, + "num_tokens": 2197715737.0, + "reward": 2.3995537757873535, + "reward_std": 0.340129554271698, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13318143784999847, + "step": 4023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 917.3772583007812, + "completions/mean_terminated_length": 728.9401245117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8574929412391455, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11967357958522949, + "kl": 0.030059814453125, + "learning_rate": 1.5474947660575528e-07, + "loss": 0.0346, + "num_tokens": 2198205186.0, + "reward": 2.486049175262451, + "reward_std": 0.356396347284317, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12130890041589737, + "step": 4024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 942.1652221679688, + "completions/mean_terminated_length": 757.859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8577060359064514, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14682389463246903, + "kl": 0.029754638671875, + "learning_rate": 1.545895522706932e-07, + "loss": 0.0814, + "num_tokens": 2198700252.0, + "reward": 2.4698662757873535, + "reward_std": 0.4171362817287445, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13763903081417084, + "step": 4025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1107.4888916015625, + "completions/mean_terminated_length": 850.98583984375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8579191305737573, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.45531164519502776, + "kl": 0.0255126953125, + "learning_rate": 1.5442984676504795e-07, + "loss": 0.1193, + "num_tokens": 2199280679.0, + "reward": 2.3565850257873535, + "reward_std": 0.43994763493537903, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14126798510551453, + "step": 4026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1008.13623046875, + "completions/mean_terminated_length": 799.0482788085938, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8581322252410634, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11304294686787422, + "kl": 0.026153564453125, + "learning_rate": 1.5427036017720398e-07, + "loss": 0.0506, + "num_tokens": 2199799348.0, + "reward": 2.4972100257873535, + "reward_std": 0.39899393916130066, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 4027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 950.26123046875, + "completions/mean_terminated_length": 763.9608764648438, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8583453199083693, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12212884101555303, + "kl": 0.02569580078125, + "learning_rate": 1.5411109259542526e-07, + "loss": 0.0683, + "num_tokens": 2200299513.0, + "reward": 2.489955425262451, + "reward_std": 0.4126487672328949, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1302192062139511, + "step": 4028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1007.4531860351562, + "completions/mean_terminated_length": 738.5477905273438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8585584145756753, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1318979964231388, + "kl": 0.025848388671875, + "learning_rate": 1.5395204410785395e-07, + "loss": 0.0859, + "num_tokens": 2200834964.0, + "reward": 2.4308037757873535, + "reward_std": 0.3784793019294739, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13629460334777832, + "step": 4029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 956.7567138671875, + "completions/mean_terminated_length": 764.8582763671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8587715092429812, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1293461130825448, + "kl": 0.02734375, + "learning_rate": 1.5379321480251156e-07, + "loss": 0.0693, + "num_tokens": 2201339367.0, + "reward": 2.364955425262451, + "reward_std": 0.33711546659469604, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13083133101463318, + "step": 4030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 973.1785888671875, + "completions/mean_terminated_length": 780.8421630859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8589846039102872, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14945580670777017, + "kl": 0.030029296875, + "learning_rate": 1.5363460476729764e-07, + "loss": 0.0928, + "num_tokens": 2201844359.0, + "reward": 2.505580425262451, + "reward_std": 0.4236098825931549, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12247265130281448, + "step": 4031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 998.0982666015625, + "completions/mean_terminated_length": 776.767578125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8591976985775931, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1279465561155509, + "kl": 0.0279541015625, + "learning_rate": 1.534762140899907e-07, + "loss": 0.0639, + "num_tokens": 2202359331.0, + "reward": 2.4308037757873535, + "reward_std": 0.431491881608963, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15090012550354004, + "step": 4032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 961.154052734375, + "completions/mean_terminated_length": 759.88623046875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.859410793244899, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1443480419386777, + "kl": 0.030059814453125, + "learning_rate": 1.5331804285824802e-07, + "loss": 0.0629, + "num_tokens": 2202854872.0, + "reward": 2.4135046005249023, + "reward_std": 0.42508289217948914, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1328611671924591, + "step": 4033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 892.2589721679688, + "completions/mean_terminated_length": 730.5139770507812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.859623887912205, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12953588915893907, + "kl": 0.03033447265625, + "learning_rate": 1.5316009115960522e-07, + "loss": 0.0586, + "num_tokens": 2203327676.0, + "reward": 2.4520089626312256, + "reward_std": 0.4128343164920807, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14360485970973969, + "step": 4034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 989.91748046875, + "completions/mean_terminated_length": 816.776611328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8598369825795109, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1388375747613047, + "kl": 0.029083251953125, + "learning_rate": 1.5300235908147646e-07, + "loss": 0.0611, + "num_tokens": 2203844151.0, + "reward": 2.458705425262451, + "reward_std": 0.5002565383911133, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16460275650024414, + "step": 4035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 970.0178833007812, + "completions/mean_terminated_length": 773.7625732421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8600500772468169, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1397498921715073, + "kl": 0.026153564453125, + "learning_rate": 1.5284484671115426e-07, + "loss": 0.0779, + "num_tokens": 2204349807.0, + "reward": 2.3431921005249023, + "reward_std": 0.3593369722366333, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10744724422693253, + "step": 4036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1001.029052734375, + "completions/mean_terminated_length": 807.1455078125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8602631719141228, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.6891131788309965, + "kl": 0.034027099609375, + "learning_rate": 1.5268755413580997e-07, + "loss": 0.0802, + "num_tokens": 2204867116.0, + "reward": 2.505580425262451, + "reward_std": 0.4100479483604431, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1236090287566185, + "step": 4037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1031.794677734375, + "completions/mean_terminated_length": 786.8919677734375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.8604762665814288, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12196421656826449, + "kl": 0.02734375, + "learning_rate": 1.525304814424927e-07, + "loss": 0.0866, + "num_tokens": 2205397152.0, + "reward": 2.3872768878936768, + "reward_std": 0.43952134251594543, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.19385726749897003, + "step": 4038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1068.8795166015625, + "completions/mean_terminated_length": 849.5136108398438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8606893612487347, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1257717609565754, + "kl": 0.0247802734375, + "learning_rate": 1.523736287181302e-07, + "loss": 0.0791, + "num_tokens": 2205954490.0, + "reward": 2.3231027126312256, + "reward_std": 0.37952110171318054, + "rewards/accuracy_reward/mean": 0.4236111044883728, + "rewards/accuracy_reward/std": 0.4947032034397125, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1149359717965126, + "step": 4039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 916.2500610351562, + "completions/mean_terminated_length": 731.0545043945312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8609024559160408, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1383920336400597, + "kl": 0.030181884765625, + "learning_rate": 1.5221699604952856e-07, + "loss": 0.1064, + "num_tokens": 2206430010.0, + "reward": 2.4308037757873535, + "reward_std": 0.4416918158531189, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14480386674404144, + "step": 4040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 911.74560546875, + "completions/mean_terminated_length": 756.0151977539062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8611155505833467, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12813956034508955, + "kl": 0.028961181640625, + "learning_rate": 1.520605835233719e-07, + "loss": 0.046, + "num_tokens": 2206903432.0, + "reward": 2.4988839626312256, + "reward_std": 0.3404530882835388, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09750169515609741, + "step": 4041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 984.029052734375, + "completions/mean_terminated_length": 773.5106811523438, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.8613286452506526, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14022510955042372, + "kl": 0.0284423828125, + "learning_rate": 1.5190439122622257e-07, + "loss": 0.0865, + "num_tokens": 2207416837.0, + "reward": 2.4168527126312256, + "reward_std": 0.4671056568622589, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.17125900089740753, + "step": 4042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 985.716552734375, + "completions/mean_terminated_length": 802.1806640625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8615417399179586, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11890608651185142, + "kl": 0.027618408203125, + "learning_rate": 1.5174841924452115e-07, + "loss": 0.0549, + "num_tokens": 2207922374.0, + "reward": 2.5362725257873535, + "reward_std": 0.3677121102809906, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.11109180748462677, + "step": 4043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1043.609375, + "completions/mean_terminated_length": 818.5819702148438, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8617548345852645, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1353989344355892, + "kl": 0.02630615234375, + "learning_rate": 1.5159266766458598e-07, + "loss": 0.0791, + "num_tokens": 2208453927.0, + "reward": 2.3861608505249023, + "reward_std": 0.4289137125015259, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1457662582397461, + "step": 4044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 984.763427734375, + "completions/mean_terminated_length": 770.9758911132812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8619679292525705, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13333855636475003, + "kl": 0.026214599609375, + "learning_rate": 1.5143713657261396e-07, + "loss": 0.1046, + "num_tokens": 2208973277.0, + "reward": 2.4921875, + "reward_std": 0.4110495150089264, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13559208810329437, + "step": 4045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1120.899658203125, + "completions/mean_terminated_length": 871.3966064453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8621810239198764, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1127224049547244, + "kl": 0.024688720703125, + "learning_rate": 1.5128182605467928e-07, + "loss": 0.0746, + "num_tokens": 2209552000.0, + "reward": 2.4112725257873535, + "reward_std": 0.4183811545372009, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13646738231182098, + "step": 4046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 882.4152221679688, + "completions/mean_terminated_length": 712.4961547851562, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8623941185871824, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1220537501274085, + "kl": 0.03216552734375, + "learning_rate": 1.511267361967347e-07, + "loss": 0.0441, + "num_tokens": 2210016330.0, + "reward": 2.5005581378936768, + "reward_std": 0.4250258505344391, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1354389488697052, + "step": 4047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1139.107177734375, + "completions/mean_terminated_length": 857.4035034179688, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.8626072132544883, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12305608743846658, + "kl": 0.024627685546875, + "learning_rate": 1.5097186708461047e-07, + "loss": 0.0721, + "num_tokens": 2210598410.0, + "reward": 2.4252233505249023, + "reward_std": 0.44480597972869873, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14835961163043976, + "step": 4048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 935.1451416015625, + "completions/mean_terminated_length": 725.5623168945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8628203079217942, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13155395863622382, + "kl": 0.0291748046875, + "learning_rate": 1.5081721880401483e-07, + "loss": 0.0701, + "num_tokens": 2211091259.0, + "reward": 2.411830425262451, + "reward_std": 0.3875403106212616, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12805373966693878, + "step": 4049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 895.747802734375, + "completions/mean_terminated_length": 710.6709594726562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8630334025891002, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14001871328866852, + "kl": 0.031829833984375, + "learning_rate": 1.5066279144053372e-07, + "loss": 0.1001, + "num_tokens": 2211562970.0, + "reward": 2.431919813156128, + "reward_std": 0.4137674868106842, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13559210300445557, + "step": 4050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 909.5982666015625, + "completions/mean_terminated_length": 750.2799072265625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8632464972564061, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1543959946973897, + "kl": 0.0308837890625, + "learning_rate": 1.505085850796308e-07, + "loss": 0.0609, + "num_tokens": 2212036614.0, + "reward": 2.5345983505249023, + "reward_std": 0.37925994396209717, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11141301691532135, + "step": 4051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 943.8951416015625, + "completions/mean_terminated_length": 779.6948852539062, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8634595919237121, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13608682055528662, + "kl": 0.03076171875, + "learning_rate": 1.503545998066477e-07, + "loss": 0.0756, + "num_tokens": 2212527511.0, + "reward": 2.553013563156128, + "reward_std": 0.42629748582839966, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13878051936626434, + "step": 4052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 912.0625610351562, + "completions/mean_terminated_length": 729.606201171875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.863672686591018, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14699163514256142, + "kl": 0.028564453125, + "learning_rate": 1.5020083570680333e-07, + "loss": 0.0792, + "num_tokens": 2212998979.0, + "reward": 2.4771206378936768, + "reward_std": 0.4592069685459137, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14177079498767853, + "step": 4053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1068.8482666015625, + "completions/mean_terminated_length": 842.89013671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.863885781258324, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12627791405463323, + "kl": 0.025146484375, + "learning_rate": 1.5004729286519444e-07, + "loss": 0.0932, + "num_tokens": 2213546815.0, + "reward": 2.349330425262451, + "reward_std": 0.47795534133911133, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16375111043453217, + "step": 4054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1068.921875, + "completions/mean_terminated_length": 852.8310546875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.86409887592563, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12607570905945265, + "kl": 0.026458740234375, + "learning_rate": 1.4989397136679513e-07, + "loss": 0.0806, + "num_tokens": 2214106588.0, + "reward": 2.3191964626312256, + "reward_std": 0.4842020571231842, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.1554640233516693, + "step": 4055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1020.15185546875, + "completions/mean_terminated_length": 765.3370361328125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.864311970592936, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1148854580066887, + "kl": 0.024993896484375, + "learning_rate": 1.4974087129645716e-07, + "loss": 0.0745, + "num_tokens": 2214635120.0, + "reward": 2.4112725257873535, + "reward_std": 0.3464162051677704, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493200302124, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11067523062229156, + "step": 4056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 944.8281860351562, + "completions/mean_terminated_length": 747.41845703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8645250652602419, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13450169531886802, + "kl": 0.028411865234375, + "learning_rate": 1.495879927389097e-07, + "loss": 0.0683, + "num_tokens": 2215123251.0, + "reward": 2.58203125, + "reward_std": 0.3914824426174164, + "rewards/accuracy_reward/mean": 0.6696428656578064, + "rewards/accuracy_reward/std": 0.4708675146102905, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13311463594436646, + "step": 4057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1048.919677734375, + "completions/mean_terminated_length": 797.754150390625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8647381599275478, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12004838276080283, + "kl": 0.024200439453125, + "learning_rate": 1.4943533577875927e-07, + "loss": 0.0733, + "num_tokens": 2215665343.0, + "reward": 2.377232313156128, + "reward_std": 0.4044736325740814, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.14033812284469604, + "step": 4058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 928.3482666015625, + "completions/mean_terminated_length": 727.989501953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8649512545948538, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12773242726153758, + "kl": 0.028564453125, + "learning_rate": 1.4928290050048994e-07, + "loss": 0.0741, + "num_tokens": 2216149323.0, + "reward": 2.4268975257873535, + "reward_std": 0.4526580572128296, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12517839670181274, + "step": 4059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 977.122802734375, + "completions/mean_terminated_length": 792.1021118164062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8651643492621597, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1489828260238368, + "kl": 0.02642822265625, + "learning_rate": 1.4913068698846287e-07, + "loss": 0.0681, + "num_tokens": 2216659266.0, + "reward": 2.2472100257873535, + "reward_std": 0.38584235310554504, + "rewards/accuracy_reward/mean": 0.3415178656578064, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12493880838155746, + "step": 4060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1058.872802734375, + "completions/mean_terminated_length": 810.20947265625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8653774439294657, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11983759944217881, + "kl": 0.026092529296875, + "learning_rate": 1.4897869532691669e-07, + "loss": 0.0421, + "num_tokens": 2217200825.0, + "reward": 2.4832589626312256, + "reward_std": 0.4228314757347107, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13189572095870972, + "step": 4061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 951.15185546875, + "completions/mean_terminated_length": 737.6320190429688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8655905385967716, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12969772532712365, + "kl": 0.03009033203125, + "learning_rate": 1.4882692559996705e-07, + "loss": 0.0501, + "num_tokens": 2217689229.0, + "reward": 2.4056921005249023, + "reward_std": 0.39541152119636536, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13254131376743317, + "step": 4062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 976.779052734375, + "completions/mean_terminated_length": 768.2479858398438, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8658036332640776, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.137910207218541, + "kl": 0.027587890625, + "learning_rate": 1.4867537789160683e-07, + "loss": 0.1017, + "num_tokens": 2218192426.0, + "reward": 2.4129464626312256, + "reward_std": 0.4871104657649994, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12891364097595215, + "step": 4063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 962.6875610351562, + "completions/mean_terminated_length": 768.4736938476562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8660167279313835, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15066555273569057, + "kl": 0.029510498046875, + "learning_rate": 1.4852405228570635e-07, + "loss": 0.1312, + "num_tokens": 2218688814.0, + "reward": 2.4140625, + "reward_std": 0.4451924264431, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13455694913864136, + "step": 4064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 927.9553833007812, + "completions/mean_terminated_length": 741.28125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8662298225986894, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1420741969141825, + "kl": 0.02874755859375, + "learning_rate": 1.4837294886601236e-07, + "loss": 0.0326, + "num_tokens": 2219174778.0, + "reward": 2.5200893878936768, + "reward_std": 0.39433616399765015, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09872953593730927, + "step": 4065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1126.0804443359375, + "completions/mean_terminated_length": 826.04736328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8664429172659954, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11289294799220716, + "kl": 0.0234375, + "learning_rate": 1.4822206771614936e-07, + "loss": 0.0444, + "num_tokens": 2219752942.0, + "reward": 2.3683037757873535, + "reward_std": 0.4109860360622406, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12270178645849228, + "step": 4066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 923.0469360351562, + "completions/mean_terminated_length": 749.0850219726562, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.8666560119333013, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1296488949600664, + "kl": 0.03076171875, + "learning_rate": 1.4807140891961838e-07, + "loss": 0.0631, + "num_tokens": 2220234931.0, + "reward": 2.35546875, + "reward_std": 0.4363895654678345, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.1657317876815796, + "step": 4067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1020.357177734375, + "completions/mean_terminated_length": 779.7245483398438, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.8668691066006073, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.12993528168241564, + "kl": 0.024444580078125, + "learning_rate": 1.4792097255979759e-07, + "loss": 0.0922, + "num_tokens": 2220753747.0, + "reward": 2.453125, + "reward_std": 0.3721867799758911, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12607400119304657, + "step": 4068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 949.7232666015625, + "completions/mean_terminated_length": 770.0051879882812, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8670822012679132, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15242613932852447, + "kl": 0.0289306640625, + "learning_rate": 1.4777075871994193e-07, + "loss": 0.1055, + "num_tokens": 2221248423.0, + "reward": 2.4380581378936768, + "reward_std": 0.501368522644043, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.16202186048030853, + "step": 4069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 988.4375610351562, + "completions/mean_terminated_length": 771.9677124023438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.8672952959352193, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12854758094670285, + "kl": 0.027618408203125, + "learning_rate": 1.4762076748318317e-07, + "loss": 0.0671, + "num_tokens": 2221761339.0, + "reward": 2.48046875, + "reward_std": 0.4184191823005676, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.12858274579048157, + "step": 4070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 920.0469360351562, + "completions/mean_terminated_length": 765.4542846679688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8675083906025252, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1466931195253997, + "kl": 0.029052734375, + "learning_rate": 1.4747099893253029e-07, + "loss": 0.1236, + "num_tokens": 2222251744.0, + "reward": 2.5390625, + "reward_std": 0.4405645728111267, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.48100292682647705, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12425371259450912, + "step": 4071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1035.5848388671875, + "completions/mean_terminated_length": 844.9177856445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8677214852698312, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 6132.149576018426, + "kl": 83.02157592773438, + "learning_rate": 1.4732145315086843e-07, + "loss": 3.4164, + "num_tokens": 2222788406.0, + "reward": 2.3794643878936768, + "reward_std": 0.44413241744041443, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1373893767595291, + "step": 4072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 952.7410888671875, + "completions/mean_terminated_length": 753.3403930664062, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.8679345799371371, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1430022607596674, + "kl": 0.02838134765625, + "learning_rate": 1.4717213022095988e-07, + "loss": 0.0693, + "num_tokens": 2223279490.0, + "reward": 2.5669643878936768, + "reward_std": 0.3638642430305481, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764313220977783, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09585529565811157, + "step": 4073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 929.747802734375, + "completions/mean_terminated_length": 789.2637939453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.868147674604443, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12926350850774657, + "kl": 0.029693603515625, + "learning_rate": 1.470230302254434e-07, + "loss": 0.0343, + "num_tokens": 2223770865.0, + "reward": 2.505580425262451, + "reward_std": 0.3987835645675659, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11709482222795486, + "step": 4074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1012.4241333007812, + "completions/mean_terminated_length": 814.122314453125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.868360769271749, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11816358792307348, + "kl": 0.028228759765625, + "learning_rate": 1.4687415324683445e-07, + "loss": 0.045, + "num_tokens": 2224292863.0, + "reward": 2.4112725257873535, + "reward_std": 0.4191451370716095, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13802284002304077, + "step": 4075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 882.62060546875, + "completions/mean_terminated_length": 709.3077392578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8685738639390549, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14272499573444483, + "kl": 0.029541015625, + "learning_rate": 1.4672549936752505e-07, + "loss": 0.1, + "num_tokens": 2224753557.0, + "reward": 2.451451063156128, + "reward_std": 0.46408915519714355, + "rewards/accuracy_reward/mean": 0.5694444179534912, + "rewards/accuracy_reward/std": 0.495728075504303, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1434776484966278, + "step": 4076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1072.8973388671875, + "completions/mean_terminated_length": 803.4244995117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8687869586063609, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13346013139901514, + "kl": 0.02606201171875, + "learning_rate": 1.4657706866978359e-07, + "loss": 0.1003, + "num_tokens": 2225307287.0, + "reward": 2.3058037757873535, + "reward_std": 0.48475760221481323, + "rewards/accuracy_reward/mean": 0.44212964177131653, + "rewards/accuracy_reward/std": 0.4972155690193176, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 4077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 959.0826416015625, + "completions/mean_terminated_length": 740.1314086914062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8690000532736668, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13354302072630625, + "kl": 0.02789306640625, + "learning_rate": 1.464288612357553e-07, + "loss": 0.0657, + "num_tokens": 2225803788.0, + "reward": 2.5089287757873535, + "reward_std": 0.3907046616077423, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11244472116231918, + "step": 4078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1036.1004638671875, + "completions/mean_terminated_length": 816.122314453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8692131479409728, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13371107186113698, + "kl": 0.02520751953125, + "learning_rate": 1.462808771474617e-07, + "loss": 0.0432, + "num_tokens": 2226339817.0, + "reward": 2.4090402126312256, + "reward_std": 0.41504529118537903, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1182868480682373, + "step": 4079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 969.2567138671875, + "completions/mean_terminated_length": 723.9534301757812, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.8694262426082787, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13785561599797685, + "kl": 0.02880859375, + "learning_rate": 1.4613311648680032e-07, + "loss": 0.0602, + "num_tokens": 2226840748.0, + "reward": 2.4955358505249023, + "reward_std": 0.44741523265838623, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.4908071458339691, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15274205803871155, + "step": 4080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1023.2120971679688, + "completions/mean_terminated_length": 800.4320678710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8696393372755847, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11843753910217387, + "kl": 0.027740478515625, + "learning_rate": 1.4598557933554573e-07, + "loss": 0.0715, + "num_tokens": 2227365995.0, + "reward": 2.453125, + "reward_std": 0.4121227562427521, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14530304074287415, + "step": 4081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1017.27685546875, + "completions/mean_terminated_length": 829.6253662109375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.8698524319428906, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12539022499083557, + "kl": 0.027862548828125, + "learning_rate": 1.4583826577534823e-07, + "loss": 0.0661, + "num_tokens": 2227891543.0, + "reward": 2.4185268878936768, + "reward_std": 0.41020217537879944, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.10627461224794388, + "step": 4082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 947.2277221679688, + "completions/mean_terminated_length": 824.3126220703125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.8700655266101965, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13509524652411065, + "kl": 0.02880859375, + "learning_rate": 1.456911758877348e-07, + "loss": 0.0863, + "num_tokens": 2228385101.0, + "reward": 2.4168527126312256, + "reward_std": 0.41891607642173767, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13622933626174927, + "step": 4083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1086.341552734375, + "completions/mean_terminated_length": 851.2694702148438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8702786212775026, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12745857006299832, + "kl": 0.026641845703125, + "learning_rate": 1.4554430975410822e-07, + "loss": 0.0828, + "num_tokens": 2228942534.0, + "reward": 2.381138563156128, + "reward_std": 0.4991455674171448, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.161164328455925, + "step": 4084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1087.680908203125, + "completions/mean_terminated_length": 849.6072387695312, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8704917159448085, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11049052548951625, + "kl": 0.024383544921875, + "learning_rate": 1.4539766745574772e-07, + "loss": 0.0472, + "num_tokens": 2229505063.0, + "reward": 2.302455425262451, + "reward_std": 0.42382556200027466, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11709482222795486, + "step": 4085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 944.341552734375, + "completions/mean_terminated_length": 770.3798828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8707048106121145, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1283670473306525, + "kl": 0.03057861328125, + "learning_rate": 1.4525124907380866e-07, + "loss": 0.0917, + "num_tokens": 2229997872.0, + "reward": 2.439174175262451, + "reward_std": 0.42247307300567627, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15578390657901764, + "step": 4086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1008.1317138671875, + "completions/mean_terminated_length": 871.5833129882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8709179052794204, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10213503831026428, + "kl": 0.026458740234375, + "learning_rate": 1.451050546893224e-07, + "loss": 0.0416, + "num_tokens": 2230522539.0, + "reward": 2.4324777126312256, + "reward_std": 0.36738675832748413, + "rewards/accuracy_reward/mean": 0.5185185074806213, + "rewards/accuracy_reward/std": 0.5002362728118896, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10210946202278137, + "step": 4087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1063.7857666015625, + "completions/mean_terminated_length": 773.6416015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8711309999467264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12922279668882436, + "kl": 0.0257568359375, + "learning_rate": 1.4495908438319626e-07, + "loss": 0.0711, + "num_tokens": 2231074075.0, + "reward": 2.3364956378936768, + "reward_std": 0.4675809442996979, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13572438061237335, + "step": 4088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1024.953125, + "completions/mean_terminated_length": 771.3286743164062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.8713440946140323, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11734569491377006, + "kl": 0.024688720703125, + "learning_rate": 1.448133382362136e-07, + "loss": 0.0598, + "num_tokens": 2231610262.0, + "reward": 2.3956475257873535, + "reward_std": 0.435901403427124, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1439727544784546, + "step": 4089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 873.9844360351562, + "completions/mean_terminated_length": 713.0786743164062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8715571892813382, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.18346112444198404, + "kl": 0.0357666015625, + "learning_rate": 1.4466781632903403e-07, + "loss": 0.0644, + "num_tokens": 2232072623.0, + "reward": 2.4799108505249023, + "reward_std": 0.49145856499671936, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 4090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 884.6875610351562, + "completions/mean_terminated_length": 731.9293212890625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8717702839486442, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1313626534332317, + "kl": 0.028594970703125, + "learning_rate": 1.4452251874219245e-07, + "loss": 0.1078, + "num_tokens": 2232542643.0, + "reward": 2.501674175262451, + "reward_std": 0.36803072690963745, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.09687710553407669, + "step": 4091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 817.5156860351562, + "completions/mean_terminated_length": 662.93212890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8719833786159501, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14064417787711353, + "kl": 0.031463623046875, + "learning_rate": 1.4437744555610008e-07, + "loss": 0.1035, + "num_tokens": 2232978778.0, + "reward": 2.5279018878936768, + "reward_std": 0.3254278302192688, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9642857313156128, + "rewards/format_reward/std": 0.18578432500362396, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09750169515609741, + "step": 4092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 903.2500610351562, + "completions/mean_terminated_length": 746.3552856445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8721964732832561, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12331589925576429, + "kl": 0.03125, + "learning_rate": 1.4423259685104384e-07, + "loss": 0.0639, + "num_tokens": 2233450026.0, + "reward": 2.529017925262451, + "reward_std": 0.3627117872238159, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13686132431030273, + "step": 4093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 908.5045166015625, + "completions/mean_terminated_length": 774.9476318359375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.872409567950562, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13902813412145534, + "kl": 0.030731201171875, + "learning_rate": 1.4408797270718645e-07, + "loss": 0.1151, + "num_tokens": 2233921852.0, + "reward": 2.532924175262451, + "reward_std": 0.45692914724349976, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12428762763738632, + "step": 4094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 943.46435546875, + "completions/mean_terminated_length": 735.4482421875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.872622662617868, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1432911726822847, + "kl": 0.032684326171875, + "learning_rate": 1.4394357320456623e-07, + "loss": 0.0652, + "num_tokens": 2234420684.0, + "reward": 2.4034600257873535, + "reward_std": 0.37642720341682434, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13519906997680664, + "step": 4095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 887.747802734375, + "completions/mean_terminated_length": 676.5145263671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8728357572851739, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.15051861371665326, + "kl": 0.030609130859375, + "learning_rate": 1.437993984230973e-07, + "loss": 0.1104, + "num_tokens": 2234885371.0, + "reward": 2.513392925262451, + "reward_std": 0.4087888300418854, + "rewards/accuracy_reward/mean": 0.6273148059844971, + "rewards/accuracy_reward/std": 0.48407992720603943, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13731664419174194, + "step": 4096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 848.2254638671875, + "completions/mean_terminated_length": 680.3180541992188, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8730488519524799, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1532225688412467, + "kl": 0.03289794921875, + "learning_rate": 1.4365544844256922e-07, + "loss": 0.0984, + "num_tokens": 2235332240.0, + "reward": 2.5033483505249023, + "reward_std": 0.388235867023468, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492350101471, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15374813973903656, + "step": 4097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 910.5870971679688, + "completions/mean_terminated_length": 714.0706787109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8732619466197858, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.27450434845069077, + "kl": 0.036773681640625, + "learning_rate": 1.4351172334264756e-07, + "loss": 0.0773, + "num_tokens": 2235817511.0, + "reward": 2.5267858505249023, + "reward_std": 0.46538981795310974, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407156348228455, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.13318143784999847, + "step": 4098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 875.1674194335938, + "completions/mean_terminated_length": 740.962646484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8734750412870917, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13123571764869235, + "kl": 0.03033447265625, + "learning_rate": 1.4336822320287284e-07, + "loss": 0.0093, + "num_tokens": 2236276386.0, + "reward": 2.4252233505249023, + "reward_std": 0.36274856328964233, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.10841450095176697, + "step": 4099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 930.5089721679688, + "completions/mean_terminated_length": 764.3179931640625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8736881359543978, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13298803285086572, + "kl": 0.028778076171875, + "learning_rate": 1.4322494810266167e-07, + "loss": 0.0796, + "num_tokens": 2236761606.0, + "reward": 2.4598214626312256, + "reward_std": 0.3826166093349457, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11444751173257828, + "step": 4100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1037.40185546875, + "completions/mean_terminated_length": 804.1868286132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8739012306217037, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13875004870749313, + "kl": 0.026153564453125, + "learning_rate": 1.4308189812130572e-07, + "loss": 0.0843, + "num_tokens": 2237287418.0, + "reward": 2.2572546005249023, + "reward_std": 0.4233165681362152, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.18169322609901428, + "step": 4101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 885.4241333007812, + "completions/mean_terminated_length": 719.341796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8741143252890097, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.5584138699317963, + "kl": 0.04290771484375, + "learning_rate": 1.429390733379723e-07, + "loss": 0.0939, + "num_tokens": 2237756776.0, + "reward": 2.4598214626312256, + "reward_std": 0.4481230080127716, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.49405214190483093, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14285963773727417, + "step": 4102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1153.30810546875, + "completions/mean_terminated_length": 844.330322265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8743274199563156, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10554906003292863, + "kl": 0.02264404296875, + "learning_rate": 1.4279647383170387e-07, + "loss": 0.0648, + "num_tokens": 2238340706.0, + "reward": 2.3738839626312256, + "reward_std": 0.37633055448532104, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.13814601302146912, + "step": 4103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 903.9219360351562, + "completions/mean_terminated_length": 716.7091064453125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8745405146236216, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.18949633495950627, + "kl": 0.032623291015625, + "learning_rate": 1.4265409968141838e-07, + "loss": 0.0799, + "num_tokens": 2238823615.0, + "reward": 2.478794813156128, + "reward_std": 0.3823574185371399, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14160890877246857, + "step": 4104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1122.53125, + "completions/mean_terminated_length": 870.1307373046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8747536092909275, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1233511403486463, + "kl": 0.021728515625, + "learning_rate": 1.4251195096590905e-07, + "loss": 0.0697, + "num_tokens": 2239400349.0, + "reward": 2.3470983505249023, + "reward_std": 0.38672545552253723, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.12332592159509659, + "step": 4105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 945.0670166015625, + "completions/mean_terminated_length": 771.2196655273438, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8749667039582334, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11512383257623415, + "kl": 0.028289794921875, + "learning_rate": 1.4237002776384437e-07, + "loss": 0.0677, + "num_tokens": 2239891019.0, + "reward": 2.459263563156128, + "reward_std": 0.3542226552963257, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1208655834197998, + "step": 4106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1041.21875, + "completions/mean_terminated_length": 828.9783935546875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8751797986255394, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12169230665160817, + "kl": 0.026123046875, + "learning_rate": 1.422283301537679e-07, + "loss": 0.0619, + "num_tokens": 2240424621.0, + "reward": 2.4425225257873535, + "reward_std": 0.4512191116809845, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.143477663397789, + "step": 4107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1024.7991943359375, + "completions/mean_terminated_length": 815.758056640625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8753928932928453, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11807054702197434, + "kl": 0.02667236328125, + "learning_rate": 1.4208685821409839e-07, + "loss": 0.0597, + "num_tokens": 2240949683.0, + "reward": 2.4425225257873535, + "reward_std": 0.40510037541389465, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12151455134153366, + "step": 4108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1092.640625, + "completions/mean_terminated_length": 900.5442504882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8756059879601513, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12082401118986372, + "kl": 0.023101806640625, + "learning_rate": 1.4194561202312978e-07, + "loss": 0.0987, + "num_tokens": 2241508466.0, + "reward": 2.361607313156128, + "reward_std": 0.4608380198478699, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13058780133724213, + "step": 4109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1022.2254638671875, + "completions/mean_terminated_length": 829.0424194335938, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8758190826274572, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11690834934567863, + "kl": 0.02947998046875, + "learning_rate": 1.4180459165903106e-07, + "loss": 0.0553, + "num_tokens": 2242034503.0, + "reward": 2.45703125, + "reward_std": 0.40497055649757385, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.12992528080940247, + "step": 4110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1059.5045166015625, + "completions/mean_terminated_length": 844.6141357421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8760321772947632, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1257085408036062, + "kl": 0.025848388671875, + "learning_rate": 1.4166379719984606e-07, + "loss": 0.1079, + "num_tokens": 2242580041.0, + "reward": 2.5111608505249023, + "reward_std": 0.4596747159957886, + "rewards/accuracy_reward/mean": 0.6342592835426331, + "rewards/accuracy_reward/std": 0.4821956753730774, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13376134634017944, + "step": 4111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1100.2076416015625, + "completions/mean_terminated_length": 861.9357299804688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8762452719620691, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11219817615665337, + "kl": 0.023834228515625, + "learning_rate": 1.415232287234939e-07, + "loss": 0.0721, + "num_tokens": 2243145110.0, + "reward": 2.256138563156128, + "reward_std": 0.35672348737716675, + "rewards/accuracy_reward/mean": 0.3370535671710968, + "rewards/accuracy_reward/std": 0.47323182225227356, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 4112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 882.1250610351562, + "completions/mean_terminated_length": 718.9618530273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8764583666293752, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1233745299271287, + "kl": 0.028717041015625, + "learning_rate": 1.413828863077684e-07, + "loss": 0.0555, + "num_tokens": 2243599838.0, + "reward": 2.6199777126312256, + "reward_std": 0.3767786920070648, + "rewards/accuracy_reward/mean": 0.6986607313156128, + "rewards/accuracy_reward/std": 0.4593527019023895, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 4113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1022.96435546875, + "completions/mean_terminated_length": 826.6808471679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.876671461296681, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12316234667394567, + "kl": 0.026092529296875, + "learning_rate": 1.4124277003033842e-07, + "loss": 0.0538, + "num_tokens": 2244126798.0, + "reward": 2.325892925262451, + "reward_std": 0.41007548570632935, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12717820703983307, + "step": 4114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 897.7277221679688, + "completions/mean_terminated_length": 743.3873291015625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.876884555963987, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12083076920625277, + "kl": 0.028778076171875, + "learning_rate": 1.4110287996874745e-07, + "loss": 0.0536, + "num_tokens": 2244588804.0, + "reward": 2.5279018878936768, + "reward_std": 0.37047278881073, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12975822389125824, + "step": 4115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1002.997802734375, + "completions/mean_terminated_length": 732.9410400390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.877097650631293, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.125971283419655, + "kl": 0.0242919921875, + "learning_rate": 1.4096321620041396e-07, + "loss": 0.0312, + "num_tokens": 2245110403.0, + "reward": 2.33203125, + "reward_std": 0.40842893719673157, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.13192765414714813, + "step": 4116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 994.8348388671875, + "completions/mean_terminated_length": 776.2533569335938, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8773107452985989, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1512076860260538, + "kl": 0.029266357421875, + "learning_rate": 1.4082377880263138e-07, + "loss": 0.1202, + "num_tokens": 2245626425.0, + "reward": 2.3744421005249023, + "reward_std": 0.47978514432907104, + "rewards/accuracy_reward/mean": 0.5069444179534912, + "rewards/accuracy_reward/std": 0.5005314350128174, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15122966468334198, + "step": 4117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 914.7120971679688, + "completions/mean_terminated_length": 722.3786010742188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8775238399659049, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13758621002695806, + "kl": 0.031005859375, + "learning_rate": 1.4068456785256726e-07, + "loss": 0.0654, + "num_tokens": 2246106008.0, + "reward": 2.5340402126312256, + "reward_std": 0.36230164766311646, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.1119314506649971, + "step": 4118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1099.546875, + "completions/mean_terminated_length": 790.8787231445312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8777369346332108, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10518328248094859, + "kl": 0.0234375, + "learning_rate": 1.4054558342726453e-07, + "loss": 0.0199, + "num_tokens": 2246676573.0, + "reward": 2.408482313156128, + "reward_std": 0.3364168405532837, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.10325382649898529, + "step": 4119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 971.8125610351562, + "completions/mean_terminated_length": 751.9462280273438, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8779500293005168, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14306732496703112, + "kl": 0.02606201171875, + "learning_rate": 1.404068256036403e-07, + "loss": 0.1091, + "num_tokens": 2247188057.0, + "reward": 2.4698662757873535, + "reward_std": 0.432937890291214, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13911856710910797, + "step": 4120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 919.4777221679688, + "completions/mean_terminated_length": 734.8103637695312, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.8781631239678227, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14060157722041228, + "kl": 0.030242919921875, + "learning_rate": 1.4026829445848642e-07, + "loss": 0.0589, + "num_tokens": 2247667215.0, + "reward": 2.490513563156128, + "reward_std": 0.3907771706581116, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13041439652442932, + "step": 4121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1021.732177734375, + "completions/mean_terminated_length": 730.6132202148438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8783762186351286, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1265082713534206, + "kl": 0.024749755859375, + "learning_rate": 1.4012999006846926e-07, + "loss": 0.0808, + "num_tokens": 2248196551.0, + "reward": 2.5200893878936768, + "reward_std": 0.3904174864292145, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12607400119304657, + "step": 4122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 954.575927734375, + "completions/mean_terminated_length": 814.1107788085938, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8785893133024346, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1455699337163834, + "kl": 0.028228759765625, + "learning_rate": 1.3999191251012964e-07, + "loss": 0.1229, + "num_tokens": 2248694585.0, + "reward": 2.5161831378936768, + "reward_std": 0.46627891063690186, + "rewards/accuracy_reward/mean": 0.6342592835426331, + "rewards/accuracy_reward/std": 0.482195645570755, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1354389488697052, + "step": 4123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 917.1317138671875, + "completions/mean_terminated_length": 790.8560791015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8788024079697405, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11273490131696988, + "kl": 0.02911376953125, + "learning_rate": 1.3985406185988314e-07, + "loss": 0.0853, + "num_tokens": 2249166788.0, + "reward": 2.6729912757873535, + "reward_std": 0.42071303725242615, + "rewards/accuracy_reward/mean": 0.7611607313156128, + "rewards/accuracy_reward/std": 0.4268510043621063, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11589459329843521, + "step": 4124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 951.4442138671875, + "completions/mean_terminated_length": 758.611572265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8790155026370465, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13527557948745145, + "kl": 0.0299072265625, + "learning_rate": 1.397164381940193e-07, + "loss": 0.0958, + "num_tokens": 2249662891.0, + "reward": 2.5189733505249023, + "reward_std": 0.41511550545692444, + "rewards/accuracy_reward/mean": 0.6597222089767456, + "rewards/accuracy_reward/std": 0.47435182332992554, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14160890877246857, + "step": 4125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1010.8795166015625, + "completions/mean_terminated_length": 795.6279907226562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8792285973043524, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13513263123848054, + "kl": 0.02630615234375, + "learning_rate": 1.3957904158870248e-07, + "loss": 0.1093, + "num_tokens": 2250184597.0, + "reward": 2.4681921005249023, + "reward_std": 0.43210306763648987, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1552460640668869, + "step": 4126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 987.2076416015625, + "completions/mean_terminated_length": 794.0818481445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8794416919716584, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12276265016604013, + "kl": 0.027313232421875, + "learning_rate": 1.3944187211997104e-07, + "loss": 0.0701, + "num_tokens": 2250697714.0, + "reward": 2.506138563156128, + "reward_std": 0.4436274468898773, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12825222313404083, + "step": 4127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 957.30810546875, + "completions/mean_terminated_length": 744.9866333007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8796547866389643, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1754091684887157, + "kl": 0.031951904296875, + "learning_rate": 1.3930492986373784e-07, + "loss": 0.0624, + "num_tokens": 2251198828.0, + "reward": 2.42578125, + "reward_std": 0.4366443455219269, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1208655834197998, + "step": 4128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 962.8125610351562, + "completions/mean_terminated_length": 795.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8798678813062704, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1283879790766306, + "kl": 0.028564453125, + "learning_rate": 1.3916821489578996e-07, + "loss": 0.0447, + "num_tokens": 2251693992.0, + "reward": 2.5050225257873535, + "reward_std": 0.4213351905345917, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12517838180065155, + "step": 4129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1091.8951416015625, + "completions/mean_terminated_length": 841.4224853515625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8800809759735763, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1120346139421575, + "kl": 0.024078369140625, + "learning_rate": 1.3903172729178854e-07, + "loss": 0.0512, + "num_tokens": 2252251657.0, + "reward": 2.24609375, + "reward_std": 0.3908710777759552, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.470055490732193, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.143477663397789, + "step": 4130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1133.0023193359375, + "completions/mean_terminated_length": 824.3612060546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8802940706408822, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11183882074357733, + "kl": 0.022552490234375, + "learning_rate": 1.3889546712726924e-07, + "loss": 0.0664, + "num_tokens": 2252830186.0, + "reward": 2.3348214626312256, + "reward_std": 0.31416070461273193, + "rewards/accuracy_reward/mean": 0.3839285671710968, + "rewards/accuracy_reward/std": 0.48688456416130066, + "rewards/format_reward/mean": 0.9642857313156128, + "rewards/format_reward/std": 0.18578432500362396, + "rewards/tag_count_reward/mean": 0.9866071343421936, + "rewards/tag_count_reward/std": 0.09060624986886978, + "step": 4131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 918.3504638671875, + "completions/mean_terminated_length": 753.6700439453125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8805071653081882, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.39833456886609653, + "kl": 0.034423828125, + "learning_rate": 1.3875943447764155e-07, + "loss": 0.1072, + "num_tokens": 2253318279.0, + "reward": 2.4285714626312256, + "reward_std": 0.48930060863494873, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.15527118742465973, + "step": 4132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 960.4219360351562, + "completions/mean_terminated_length": 801.8746948242188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8807202599754941, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14112593125284428, + "kl": 0.0313720703125, + "learning_rate": 1.3862362941818894e-07, + "loss": 0.073, + "num_tokens": 2253812020.0, + "reward": 2.5580358505249023, + "reward_std": 0.3961932361125946, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.4776431620121002, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11286582797765732, + "step": 4133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 997.372802734375, + "completions/mean_terminated_length": 779.3180541992188, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8809333546428001, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13669803972692854, + "kl": 0.027801513671875, + "learning_rate": 1.384880520240694e-07, + "loss": 0.099, + "num_tokens": 2254325163.0, + "reward": 2.462611675262451, + "reward_std": 0.4169662594795227, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1199323832988739, + "step": 4134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1008.4754638671875, + "completions/mean_terminated_length": 779.0435791015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.881146449310106, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13605609765479867, + "kl": 0.02728271484375, + "learning_rate": 1.3835270237031439e-07, + "loss": 0.0899, + "num_tokens": 2254847904.0, + "reward": 2.357142925262451, + "reward_std": 0.43272456526756287, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15365473926067352, + "step": 4135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 903.2388916015625, + "completions/mean_terminated_length": 729.61181640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.881359543977412, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12512603042140577, + "kl": 0.033477783203125, + "learning_rate": 1.382175805318299e-07, + "loss": 0.0257, + "num_tokens": 2255313739.0, + "reward": 2.5284600257873535, + "reward_std": 0.32592588663101196, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.11684267967939377, + "step": 4136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1053.341552734375, + "completions/mean_terminated_length": 887.5651245117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8815726386447179, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11587492375591205, + "kl": 0.027923583984375, + "learning_rate": 1.3808268658339506e-07, + "loss": -0.0068, + "num_tokens": 2255856116.0, + "reward": 2.3989956378936768, + "reward_std": 0.3875787556171417, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12825222313404083, + "step": 4137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1019.638427734375, + "completions/mean_terminated_length": 822.7180786132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8817857333120238, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1492363092681286, + "kl": 0.026397705078125, + "learning_rate": 1.3794802059966378e-07, + "loss": 0.0793, + "num_tokens": 2256385362.0, + "reward": 2.3839287757873535, + "reward_std": 0.44679322838783264, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.1378791779279709, + "step": 4138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 942.1897583007812, + "completions/mean_terminated_length": 761.2389526367188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8819988279793298, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12533443158549135, + "kl": 0.029632568359375, + "learning_rate": 1.3781358265516324e-07, + "loss": 0.0596, + "num_tokens": 2256871495.0, + "reward": 2.4693081378936768, + "reward_std": 0.4305974543094635, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1328611671924591, + "step": 4139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1149.6585693359375, + "completions/mean_terminated_length": 864.302978515625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.8822119226466357, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12655382202218207, + "kl": 0.024810791015625, + "learning_rate": 1.3767937282429448e-07, + "loss": 0.0883, + "num_tokens": 2257460926.0, + "reward": 2.3113839626312256, + "reward_std": 0.5290858745574951, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1599874496459961, + "step": 4140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 903.6428833007812, + "completions/mean_terminated_length": 736.8184204101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8824250173139417, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13653692157954006, + "kl": 0.028839111328125, + "learning_rate": 1.375453911813324e-07, + "loss": 0.0821, + "num_tokens": 2257932878.0, + "reward": 2.4453125, + "reward_std": 0.44600623846054077, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.13507550954818726, + "step": 4141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 965.88623046875, + "completions/mean_terminated_length": 792.0751342773438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8826381119812476, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12745522914075622, + "kl": 0.028594970703125, + "learning_rate": 1.374116378004255e-07, + "loss": 0.0366, + "num_tokens": 2258433611.0, + "reward": 2.4799108505249023, + "reward_std": 0.43910205364227295, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.08826113492250443, + "step": 4142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 890.0357666015625, + "completions/mean_terminated_length": 721.2276000976562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8828512066485537, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12926332774022947, + "kl": 0.02838134765625, + "learning_rate": 1.372781127555963e-07, + "loss": 0.0392, + "num_tokens": 2258899371.0, + "reward": 2.5234375, + "reward_std": 0.35832515358924866, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407156348228455, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13865114748477936, + "step": 4143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 980.435302734375, + "completions/mean_terminated_length": 789.3973999023438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8830643013158596, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14136847643221295, + "kl": 0.028167724609375, + "learning_rate": 1.3714481612074047e-07, + "loss": 0.0552, + "num_tokens": 2259412910.0, + "reward": 2.29296875, + "reward_std": 0.3875815272331238, + "rewards/accuracy_reward/mean": 0.3950892984867096, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.12356231361627579, + "step": 4144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1045.4754638671875, + "completions/mean_terminated_length": 840.6586303710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8832773959831656, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1276031823902063, + "kl": 0.026824951171875, + "learning_rate": 1.3701174796962743e-07, + "loss": 0.0862, + "num_tokens": 2259951667.0, + "reward": 2.404576063156128, + "reward_std": 0.4412045478820801, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1217813789844513, + "step": 4145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1044.4285888671875, + "completions/mean_terminated_length": 802.5706176757812, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8834904906504715, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10287085495815296, + "kl": 0.024322509765625, + "learning_rate": 1.3687890837590044e-07, + "loss": 0.0373, + "num_tokens": 2260490467.0, + "reward": 2.3504464626312256, + "reward_std": 0.3876363933086395, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14336557686328888, + "step": 4146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1032.575927734375, + "completions/mean_terminated_length": 770.1629028320312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8837035853177774, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15021665747205368, + "kl": 0.02618408203125, + "learning_rate": 1.3674629741307594e-07, + "loss": 0.1457, + "num_tokens": 2261015557.0, + "reward": 2.3950893878936768, + "reward_std": 0.509012758731842, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9486607313156128, + "rewards/tag_count_reward/std": 0.1797599494457245, + "step": 4147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 866.9397583007812, + "completions/mean_terminated_length": 708.4683837890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8839166799850834, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13465845816710117, + "kl": 0.03302001953125, + "learning_rate": 1.3661391515454396e-07, + "loss": 0.0526, + "num_tokens": 2261477930.0, + "reward": 2.3431921005249023, + "reward_std": 0.3759476840496063, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.1336481273174286, + "step": 4148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1144.904052734375, + "completions/mean_terminated_length": 914.703125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.8841297746523893, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12465350130852852, + "kl": 0.0252685546875, + "learning_rate": 1.3648176167356782e-07, + "loss": 0.0648, + "num_tokens": 2262062383.0, + "reward": 2.2918527126312256, + "reward_std": 0.45645463466644287, + "rewards/accuracy_reward/mean": 0.4084821343421936, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14843006432056427, + "step": 4149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1036.2835693359375, + "completions/mean_terminated_length": 788.9750366210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8843428693196953, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1319010865255141, + "kl": 0.02880859375, + "learning_rate": 1.3634983704328457e-07, + "loss": 0.0977, + "num_tokens": 2262595486.0, + "reward": 2.4486608505249023, + "reward_std": 0.4144447445869446, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.1605832874774933, + "step": 4150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1058.33935546875, + "completions/mean_terminated_length": 799.0760498046875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.8845559639870012, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12893099671427674, + "kl": 0.0269775390625, + "learning_rate": 1.3621814133670435e-07, + "loss": 0.0786, + "num_tokens": 2263141558.0, + "reward": 2.3833706378936768, + "reward_std": 0.5027891397476196, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.8928571343421936, + "rewards/format_reward/std": 0.3096405565738678, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.1675894856452942, + "step": 4151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 888.8058471679688, + "completions/mean_terminated_length": 752.940185546875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8847690586543072, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13333443124834937, + "kl": 0.0296630859375, + "learning_rate": 1.3608667462671044e-07, + "loss": 0.0247, + "num_tokens": 2263608431.0, + "reward": 2.5513393878936768, + "reward_std": 0.39516928791999817, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10649171471595764, + "step": 4152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1060.055908203125, + "completions/mean_terminated_length": 832.0687255859375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.8849821533216131, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12767413539345768, + "kl": 0.025848388671875, + "learning_rate": 1.3595543698605988e-07, + "loss": 0.0738, + "num_tokens": 2264155592.0, + "reward": 2.3331475257873535, + "reward_std": 0.5176364779472351, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15332838892936707, + "step": 4153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1064.375, + "completions/mean_terminated_length": 781.72412109375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8851952479889191, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12215680475396902, + "kl": 0.024200439453125, + "learning_rate": 1.3582442848738252e-07, + "loss": 0.1043, + "num_tokens": 2264699040.0, + "reward": 2.4034600257873535, + "reward_std": 0.4074050486087799, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11540208011865616, + "step": 4154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 994.8482666015625, + "completions/mean_terminated_length": 762.40869140625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.885408342656225, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.13498585074124453, + "kl": 0.028656005859375, + "learning_rate": 1.3569364920318155e-07, + "loss": 0.1081, + "num_tokens": 2265209804.0, + "reward": 2.3046875, + "reward_std": 0.3655664026737213, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1464626044034958, + "step": 4155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 921.5692138671875, + "completions/mean_terminated_length": 760.6505126953125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.8856214373235309, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13126511488096904, + "kl": 0.030853271484375, + "learning_rate": 1.3556309920583332e-07, + "loss": 0.0309, + "num_tokens": 2265688603.0, + "reward": 2.53515625, + "reward_std": 0.331646203994751, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.13925647735595703, + "step": 4156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1056.62060546875, + "completions/mean_terminated_length": 810.8468017578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.885834531990837, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13179544745997673, + "kl": 0.02508544921875, + "learning_rate": 1.3543277856758712e-07, + "loss": 0.0838, + "num_tokens": 2266231265.0, + "reward": 2.4246652126312256, + "reward_std": 0.3900611996650696, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13064393401145935, + "step": 4157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 965.69873046875, + "completions/mean_terminated_length": 788.5947875976562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8860476266581429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11949250115432297, + "kl": 0.028472900390625, + "learning_rate": 1.3530268736056565e-07, + "loss": 0.1004, + "num_tokens": 2266733130.0, + "reward": 2.489955425262451, + "reward_std": 0.33382484316825867, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9854910969734192, + "rewards/tag_count_reward/std": 0.09785954654216766, + "step": 4158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1132.7857666015625, + "completions/mean_terminated_length": 869.7930908203125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8862607213254489, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11266642714720972, + "kl": 0.02276611328125, + "learning_rate": 1.351728256567644e-07, + "loss": 0.0916, + "num_tokens": 2267313290.0, + "reward": 2.3125, + "reward_std": 0.40958693623542786, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.13325640559196472, + "step": 4159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1103.375, + "completions/mean_terminated_length": 795.9526977539062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8864738159927548, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11452594457792649, + "kl": 0.026031494140625, + "learning_rate": 1.3504319352805179e-07, + "loss": 0.0482, + "num_tokens": 2267875778.0, + "reward": 2.3560268878936768, + "reward_std": 0.3648747205734253, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1502326875925064, + "step": 4160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1005.013427734375, + "completions/mean_terminated_length": 785.1405639648438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8866869106600608, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1314363778744809, + "kl": 0.03143310546875, + "learning_rate": 1.3491379104616938e-07, + "loss": 0.082, + "num_tokens": 2268390472.0, + "reward": 2.4012277126312256, + "reward_std": 0.42521148920059204, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14321638643741608, + "step": 4161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 949.99560546875, + "completions/mean_terminated_length": 732.7433471679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8869000053273667, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14584501560191895, + "kl": 0.033966064453125, + "learning_rate": 1.347846182827314e-07, + "loss": 0.0976, + "num_tokens": 2268881462.0, + "reward": 2.5546875, + "reward_std": 0.4590505361557007, + "rewards/accuracy_reward/mean": 0.6919642686843872, + "rewards/accuracy_reward/std": 0.46219751238822937, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15063102543354034, + "step": 4162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 959.0000610351562, + "completions/mean_terminated_length": 743.5294189453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8871130999946726, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12008285243699954, + "kl": 0.0279541015625, + "learning_rate": 1.3465567530922526e-07, + "loss": 0.047, + "num_tokens": 2269379190.0, + "reward": 2.3800225257873535, + "reward_std": 0.39347752928733826, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1363026201725006, + "step": 4163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 929.6585083007812, + "completions/mean_terminated_length": 763.341064453125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8873261946619786, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13550099736132898, + "kl": 0.0335693359375, + "learning_rate": 1.345269621970108e-07, + "loss": 0.0773, + "num_tokens": 2269862813.0, + "reward": 2.4213171005249023, + "reward_std": 0.4631577134132385, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1277548223733902, + "step": 4164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 999.1183471679688, + "completions/mean_terminated_length": 833.7907104492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8875392893292845, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11426661027477648, + "kl": 0.027313232421875, + "learning_rate": 1.3439847901732116e-07, + "loss": 0.0267, + "num_tokens": 2270381458.0, + "reward": 2.4737725257873535, + "reward_std": 0.42879223823547363, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13064393401145935, + "step": 4165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 988.7277221679688, + "completions/mean_terminated_length": 824.9226684570312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8877523839965905, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1423658838220071, + "kl": 0.031768798828125, + "learning_rate": 1.3427022584126173e-07, + "loss": 0.0705, + "num_tokens": 2270894264.0, + "reward": 2.419642925262451, + "reward_std": 0.4749682545661926, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.13989263772964478, + "step": 4166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 813.6517944335938, + "completions/mean_terminated_length": 672.407958984375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8879654786638964, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1555839215612168, + "kl": 0.03387451171875, + "learning_rate": 1.341422027398109e-07, + "loss": 0.0946, + "num_tokens": 2271325644.0, + "reward": 2.5245537757873535, + "reward_std": 0.40580102801322937, + "rewards/accuracy_reward/mean": 0.6272321343421936, + "rewards/accuracy_reward/std": 0.4840816557407379, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13376134634017944, + "step": 4167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1005.6406860351562, + "completions/mean_terminated_length": 799.3984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8881785733312024, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12830997387760776, + "kl": 0.027069091796875, + "learning_rate": 1.340144097838197e-07, + "loss": 0.071, + "num_tokens": 2271850923.0, + "reward": 2.4598214626312256, + "reward_std": 0.46587109565734863, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.4929138123989105, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12223286926746368, + "step": 4168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 990.94873046875, + "completions/mean_terminated_length": 801.7921142578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8883916679985083, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1413313435076299, + "kl": 0.026824951171875, + "learning_rate": 1.3388684704401155e-07, + "loss": 0.0847, + "num_tokens": 2272356068.0, + "reward": 2.3470983505249023, + "reward_std": 0.4011780321598053, + "rewards/accuracy_reward/mean": 0.4263392984867096, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13709373772144318, + "step": 4169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 912.43310546875, + "completions/mean_terminated_length": 736.8298950195312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8886047626658143, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13113559089731008, + "kl": 0.029266357421875, + "learning_rate": 1.33759514590983e-07, + "loss": 0.0393, + "num_tokens": 2272831542.0, + "reward": 2.502232313156128, + "reward_std": 0.4183984100818634, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12733517587184906, + "step": 4170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 984.8170166015625, + "completions/mean_terminated_length": 764.1563110351562, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8888178573331202, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13617647948160563, + "kl": 0.031707763671875, + "learning_rate": 1.3363241249520251e-07, + "loss": 0.0501, + "num_tokens": 2273352500.0, + "reward": 2.385044813156128, + "reward_std": 0.4162093698978424, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1468711644411087, + "step": 4171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 936.8326416015625, + "completions/mean_terminated_length": 720.5253295898438, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8890309520004261, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1398165106656057, + "kl": 0.029998779296875, + "learning_rate": 1.3350554082701155e-07, + "loss": 0.1178, + "num_tokens": 2273838745.0, + "reward": 2.396763563156128, + "reward_std": 0.420979768037796, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14078108966350555, + "step": 4172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 919.7835083007812, + "completions/mean_terminated_length": 738.5673217773438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8892440466677322, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11619482855785468, + "kl": 0.03070068359375, + "learning_rate": 1.333788996566238e-07, + "loss": 0.0285, + "num_tokens": 2274314504.0, + "reward": 2.4760046005249023, + "reward_std": 0.414402574300766, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13646738231182098, + "step": 4173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 900.8973388671875, + "completions/mean_terminated_length": 709.7135620117188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8894571413350381, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15130816759108975, + "kl": 0.030914306640625, + "learning_rate": 1.3325248905412544e-07, + "loss": 0.1496, + "num_tokens": 2274788858.0, + "reward": 2.4224331378936768, + "reward_std": 0.4770428240299225, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9514508843421936, + "rewards/tag_count_reward/std": 0.16217589378356934, + "step": 4174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1084.185302734375, + "completions/mean_terminated_length": 817.8319091796875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8896702360023441, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12034071773916155, + "kl": 0.0262451171875, + "learning_rate": 1.3312630908947514e-07, + "loss": 0.0371, + "num_tokens": 2275342941.0, + "reward": 2.411830425262451, + "reward_std": 0.44236335158348083, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16740036010742188, + "step": 4175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 954.4063110351562, + "completions/mean_terminated_length": 775.4545288085938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.88988333066965, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14857972524808974, + "kl": 0.028778076171875, + "learning_rate": 1.330003598325037e-07, + "loss": 0.1171, + "num_tokens": 2275836819.0, + "reward": 2.5072546005249023, + "reward_std": 0.40550389885902405, + "rewards/accuracy_reward/mean": 0.6087962985038757, + "rewards/accuracy_reward/std": 0.4885856807231903, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12292414903640747, + "step": 4176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1038.9442138671875, + "completions/mean_terminated_length": 845.720703125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.890096425336956, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13286117642704873, + "kl": 0.027435302734375, + "learning_rate": 1.3287464135291456e-07, + "loss": 0.0837, + "num_tokens": 2276380602.0, + "reward": 2.458705425262451, + "reward_std": 0.47483500838279724, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15247619152069092, + "step": 4177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1104.9910888671875, + "completions/mean_terminated_length": 861.2921142578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8903095200042619, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12262046174993887, + "kl": 0.0257568359375, + "learning_rate": 1.3274915372028317e-07, + "loss": 0.0722, + "num_tokens": 2276950390.0, + "reward": 2.478236675262451, + "reward_std": 0.47717219591140747, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14732414484024048, + "step": 4178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1044.727783203125, + "completions/mean_terminated_length": 842.9973754882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8905226146715678, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 1.0667453980886112, + "kl": 0.122467041015625, + "learning_rate": 1.3262389700405743e-07, + "loss": 0.0924, + "num_tokens": 2277492076.0, + "reward": 2.431919813156128, + "reward_std": 0.4180252254009247, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13661938905715942, + "step": 4179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1017.013427734375, + "completions/mean_terminated_length": 832.5211181640625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8907357093388738, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1278579701484868, + "kl": 0.02581787109375, + "learning_rate": 1.3249887127355721e-07, + "loss": 0.093, + "num_tokens": 2278020962.0, + "reward": 2.4073662757873535, + "reward_std": 0.41762349009513855, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09892533719539642, + "step": 4180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1108.69873046875, + "completions/mean_terminated_length": 855.9121704101562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.8909488040061797, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.18512011249742813, + "kl": 0.030548095703125, + "learning_rate": 1.3237407659797485e-07, + "loss": 0.0576, + "num_tokens": 2278586555.0, + "reward": 2.3833706378936768, + "reward_std": 0.45720839500427246, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15488377213478088, + "step": 4181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1070.859375, + "completions/mean_terminated_length": 838.7210083007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8911618986734857, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1232918356046186, + "kl": 0.025421142578125, + "learning_rate": 1.3224951304637446e-07, + "loss": 0.0733, + "num_tokens": 2279142396.0, + "reward": 2.5418527126312256, + "reward_std": 0.4559517800807953, + "rewards/accuracy_reward/mean": 0.6339285969734192, + "rewards/accuracy_reward/std": 0.482267826795578, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13469025492668152, + "step": 4182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1097.2054443359375, + "completions/mean_terminated_length": 834.4501342773438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8913749933407916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12874870656372073, + "kl": 0.027496337890625, + "learning_rate": 1.321251806876925e-07, + "loss": 0.0459, + "num_tokens": 2279704552.0, + "reward": 2.4129464626312256, + "reward_std": 0.5138823986053467, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.15647254884243011, + "step": 4183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 927.0870971679688, + "completions/mean_terminated_length": 715.9867553710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8915880880080976, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1365260904839435, + "kl": 0.027984619140625, + "learning_rate": 1.3200107959073752e-07, + "loss": 0.08, + "num_tokens": 2280194831.0, + "reward": 2.2310268878936768, + "reward_std": 0.4414485692977905, + "rewards/accuracy_reward/mean": 0.3325892984867096, + "rewards/accuracy_reward/std": 0.47166749835014343, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.146973118185997, + "step": 4184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1041.4420166015625, + "completions/mean_terminated_length": 795.3944702148438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8918011826754035, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14192262125918254, + "kl": 0.02801513671875, + "learning_rate": 1.3187720982418993e-07, + "loss": 0.1003, + "num_tokens": 2280728101.0, + "reward": 2.421875, + "reward_std": 0.5066932439804077, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.18345820903778076, + "step": 4185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1152.946533203125, + "completions/mean_terminated_length": 843.8438720703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8920142773427095, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12033127255993122, + "kl": 0.023101806640625, + "learning_rate": 1.3175357145660204e-07, + "loss": 0.0813, + "num_tokens": 2281316733.0, + "reward": 2.4263393878936768, + "reward_std": 0.431247740983963, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11286582797765732, + "step": 4186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1077.997802734375, + "completions/mean_terminated_length": 827.3230590820312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8922273720100155, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13569091351186136, + "kl": 0.026092529296875, + "learning_rate": 1.3163016455639832e-07, + "loss": 0.1356, + "num_tokens": 2281872172.0, + "reward": 2.3443081378936768, + "reward_std": 0.513056755065918, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.17603272199630737, + "step": 4187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1031.9598388671875, + "completions/mean_terminated_length": 817.767578125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8924404666773214, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11088941502325582, + "kl": 0.025634765625, + "learning_rate": 1.3150698919187504e-07, + "loss": 0.0584, + "num_tokens": 2282401114.0, + "reward": 2.396763563156128, + "reward_std": 0.4460963010787964, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1372518688440323, + "step": 4188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1115.3482666015625, + "completions/mean_terminated_length": 860.9886474609375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8926535613446274, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11464826896376981, + "kl": 0.023590087890625, + "learning_rate": 1.313840454312004e-07, + "loss": 0.0708, + "num_tokens": 2282972374.0, + "reward": 2.2611608505249023, + "reward_std": 0.44804778695106506, + "rewards/accuracy_reward/mean": 0.3861607015132904, + "rewards/accuracy_reward/std": 0.4874124228954315, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.17468811571598053, + "step": 4189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1154.46435546875, + "completions/mean_terminated_length": 827.5609741210938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8928666560119333, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.29259013530090344, + "kl": 0.025634765625, + "learning_rate": 1.3126133334241417e-07, + "loss": 0.1053, + "num_tokens": 2283564054.0, + "reward": 2.217076063156128, + "reward_std": 0.4451143741607666, + "rewards/accuracy_reward/mean": 0.3147321343421936, + "rewards/accuracy_reward/std": 0.4649282693862915, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15332838892936707, + "step": 4190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 925.8035888671875, + "completions/mean_terminated_length": 731.916259765625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8930797506792393, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.27182220454498246, + "kl": 0.03485107421875, + "learning_rate": 1.3113885299342834e-07, + "loss": 0.1074, + "num_tokens": 2284042638.0, + "reward": 2.51171875, + "reward_std": 0.40785545110702515, + "rewards/accuracy_reward/mean": 0.5982142686843872, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1258051097393036, + "step": 4191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 966.4129638671875, + "completions/mean_terminated_length": 786.1484375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.8932928453465452, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1418838796836582, + "kl": 0.02880859375, + "learning_rate": 1.3101660445202623e-07, + "loss": 0.093, + "num_tokens": 2284545559.0, + "reward": 2.5005581378936768, + "reward_std": 0.4155113101005554, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11561822891235352, + "step": 4192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 973.4152221679688, + "completions/mean_terminated_length": 714.4431762695312, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.8935059400138512, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12938872114589273, + "kl": 0.02685546875, + "learning_rate": 1.3089458778586318e-07, + "loss": 0.0599, + "num_tokens": 2285046257.0, + "reward": 2.421875, + "reward_std": 0.3593243658542633, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11321921646595001, + "step": 4193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1034.571533203125, + "completions/mean_terminated_length": 850.0686645507812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8937190346811571, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11765186524065456, + "kl": 0.026153564453125, + "learning_rate": 1.3077280306246593e-07, + "loss": 0.0601, + "num_tokens": 2285580977.0, + "reward": 2.46484375, + "reward_std": 0.47264814376831055, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11801211535930634, + "step": 4194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 974.0045166015625, + "completions/mean_terminated_length": 814.2821044921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.893932129348463, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13825517255564443, + "kl": 0.02789306640625, + "learning_rate": 1.3065125034923304e-07, + "loss": 0.0749, + "num_tokens": 2286081107.0, + "reward": 2.52734375, + "reward_std": 0.48276105523109436, + "rewards/accuracy_reward/mean": 0.6741071343421936, + "rewards/accuracy_reward/std": 0.4692314565181732, + "rewards/format_reward/mean": 0.8861607313156128, + "rewards/format_reward/std": 0.31797102093696594, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.12802813947200775, + "step": 4195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1046.446533203125, + "completions/mean_terminated_length": 794.6591796875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.894145224015769, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12304409933832433, + "kl": 0.024383544921875, + "learning_rate": 1.3052992971343486e-07, + "loss": 0.1059, + "num_tokens": 2286624107.0, + "reward": 2.41015625, + "reward_std": 0.4155735373497009, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12605296075344086, + "step": 4196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1096.185302734375, + "completions/mean_terminated_length": 870.0635986328125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8943583186830749, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13258420481586983, + "kl": 0.025787353515625, + "learning_rate": 1.3040884122221276e-07, + "loss": 0.0356, + "num_tokens": 2287188590.0, + "reward": 2.204799175262451, + "reward_std": 0.43173640966415405, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.470055490732193, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13622933626174927, + "step": 4197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 896.154052734375, + "completions/mean_terminated_length": 653.3324584960938, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8945714133503809, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.14441045907185743, + "kl": 0.0303955078125, + "learning_rate": 1.3028798494258004e-07, + "loss": 0.0779, + "num_tokens": 2287658419.0, + "reward": 2.4921875, + "reward_std": 0.2945016026496887, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11141301691532135, + "step": 4198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 969.8973388671875, + "completions/mean_terminated_length": 793.4805297851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8947845080176868, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12714605252413247, + "kl": 0.027099609375, + "learning_rate": 1.301673609414215e-07, + "loss": 0.0551, + "num_tokens": 2288160101.0, + "reward": 2.53515625, + "reward_std": 0.39811643958091736, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.09791535139083862, + "step": 4199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1040.462158203125, + "completions/mean_terminated_length": 853.8809204101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8949976026849928, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1390015289080822, + "kl": 0.02569580078125, + "learning_rate": 1.3004696928549322e-07, + "loss": 0.1007, + "num_tokens": 2288699492.0, + "reward": 2.4441964626312256, + "reward_std": 0.4293304979801178, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.225421741604805, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13376134634017944, + "step": 4200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 938.15185546875, + "completions/mean_terminated_length": 838.2384643554688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8952106973522987, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11586176869558541, + "kl": 0.029266357421875, + "learning_rate": 1.2992681004142276e-07, + "loss": 0.0195, + "num_tokens": 2289188232.0, + "reward": 2.517857313156128, + "reward_std": 0.31761831045150757, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9642857313156128, + "rewards/format_reward/std": 0.18578432500362396, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10383255779743195, + "step": 4201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 989.9397583007812, + "completions/mean_terminated_length": 783.9706420898438, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.8954237920196048, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1388401179758574, + "kl": 0.028717041015625, + "learning_rate": 1.2980688327570906e-07, + "loss": 0.0926, + "num_tokens": 2289702941.0, + "reward": 2.3839287757873535, + "reward_std": 0.4524340033531189, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14767222106456757, + "step": 4202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1012.341552734375, + "completions/mean_terminated_length": 773.3434448242188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8956368866869107, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12290577045198806, + "kl": 0.02691650390625, + "learning_rate": 1.2968718905472242e-07, + "loss": 0.0618, + "num_tokens": 2290223958.0, + "reward": 2.467076063156128, + "reward_std": 0.4275207817554474, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14151521027088165, + "step": 4203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 947.2098388671875, + "completions/mean_terminated_length": 760.3916625976562, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8958499813542166, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13044974650150393, + "kl": 0.029541015625, + "learning_rate": 1.2956772744470455e-07, + "loss": 0.029, + "num_tokens": 2290714692.0, + "reward": 2.4877233505249023, + "reward_std": 0.4086536169052124, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.09564017504453659, + "step": 4204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 740.0067138671875, + "completions/mean_terminated_length": 625.7160034179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8960630760215226, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15158626577138942, + "kl": 0.03448486328125, + "learning_rate": 1.2944849851176803e-07, + "loss": 0.0345, + "num_tokens": 2291109543.0, + "reward": 2.5357143878936768, + "reward_std": 0.40671274065971375, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11652304977178574, + "step": 4205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 962.8995971679688, + "completions/mean_terminated_length": 782.0494995117188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8962761706888285, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12247905169907082, + "kl": 0.026519775390625, + "learning_rate": 1.2932950232189722e-07, + "loss": 0.034, + "num_tokens": 2291608138.0, + "reward": 2.4168527126312256, + "reward_std": 0.42413970828056335, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14945264160633087, + "step": 4206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1094.384033203125, + "completions/mean_terminated_length": 847.94384765625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8964892653561345, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1214223562153993, + "kl": 0.02349853515625, + "learning_rate": 1.292107389409473e-07, + "loss": 0.0653, + "num_tokens": 2292170886.0, + "reward": 2.3878350257873535, + "reward_std": 0.45826229453086853, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13519908487796783, + "step": 4207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1057.1160888671875, + "completions/mean_terminated_length": 808.0111694335938, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8967023600234404, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1152694385859775, + "kl": 0.027252197265625, + "learning_rate": 1.290922084346447e-07, + "loss": 0.0516, + "num_tokens": 2292720426.0, + "reward": 2.3839287757873535, + "reward_std": 0.4220777750015259, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10649171471595764, + "step": 4208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1043.8326416015625, + "completions/mean_terminated_length": 801.8309936523438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8969154546907464, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.22215838622224887, + "kl": 0.032562255859375, + "learning_rate": 1.289739108685869e-07, + "loss": 0.0555, + "num_tokens": 2293261375.0, + "reward": 2.4190850257873535, + "reward_std": 0.40214914083480835, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14027473330497742, + "step": 4209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1113.638427734375, + "completions/mean_terminated_length": 813.2094116210938, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8971285493580523, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11475254776176785, + "kl": 0.022491455078125, + "learning_rate": 1.2885584630824267e-07, + "loss": 0.0438, + "num_tokens": 2293826573.0, + "reward": 2.2667412757873535, + "reward_std": 0.38323456048965454, + "rewards/accuracy_reward/mean": 0.3549107015132904, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15391045808792114, + "step": 4210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 990.3326416015625, + "completions/mean_terminated_length": 770.8167114257812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.8973416440253583, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.145933542545102, + "kl": 0.027740478515625, + "learning_rate": 1.2873801481895168e-07, + "loss": 0.0918, + "num_tokens": 2294347810.0, + "reward": 2.4698662757873535, + "reward_std": 0.4441182613372803, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15645259618759155, + "step": 4211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1084.5692138671875, + "completions/mean_terminated_length": 832.1774291992188, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.8975547386926642, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14802286626860592, + "kl": 0.028717041015625, + "learning_rate": 1.286204164659247e-07, + "loss": 0.103, + "num_tokens": 2294903041.0, + "reward": 2.3973214626312256, + "reward_std": 0.42042165994644165, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.13888955116271973, + "step": 4212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 967.4063110351562, + "completions/mean_terminated_length": 767.2962646484375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.8977678333599701, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14647362729737298, + "kl": 0.028167724609375, + "learning_rate": 1.2850305131424326e-07, + "loss": 0.0867, + "num_tokens": 2295400615.0, + "reward": 2.3895089626312256, + "reward_std": 0.42311400175094604, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1421017199754715, + "step": 4213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 942.1406860351562, + "completions/mean_terminated_length": 793.759521484375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8979809280272761, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13343624231106255, + "kl": 0.02838134765625, + "learning_rate": 1.2838591942886003e-07, + "loss": 0.0815, + "num_tokens": 2295892534.0, + "reward": 2.5708706378936768, + "reward_std": 0.4099407494068146, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14321638643741608, + "step": 4214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1033.32373046875, + "completions/mean_terminated_length": 839.02392578125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.898194022694582, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12198398701812983, + "kl": 0.02728271484375, + "learning_rate": 1.2826902087459878e-07, + "loss": 0.0769, + "num_tokens": 2296426871.0, + "reward": 2.470424175262451, + "reward_std": 0.4267212748527527, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13254131376743317, + "step": 4215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1004.6406860351562, + "completions/mean_terminated_length": 837.0543823242188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.898407117361888, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13228505168562116, + "kl": 0.030029296875, + "learning_rate": 1.281523557161536e-07, + "loss": 0.0957, + "num_tokens": 2296950182.0, + "reward": 2.39453125, + "reward_std": 0.44787293672561646, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.16029441356658936, + "step": 4216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 898.6250610351562, + "completions/mean_terminated_length": 682.1644287109375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.898620212029194, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14147386848775725, + "kl": 0.030609130859375, + "learning_rate": 1.280359240180898e-07, + "loss": 0.0806, + "num_tokens": 2297423646.0, + "reward": 2.4921875, + "reward_std": 0.4439813196659088, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.17418356239795685, + "step": 4217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 939.2366333007812, + "completions/mean_terminated_length": 733.9100341796875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8988333066965, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.130985492853666, + "kl": 0.0306396484375, + "learning_rate": 1.2791972584484353e-07, + "loss": 0.0597, + "num_tokens": 2297914568.0, + "reward": 2.458705425262451, + "reward_std": 0.4370371401309967, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14160890877246857, + "step": 4218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 878.1629638671875, + "completions/mean_terminated_length": 741.0499267578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8990464013638059, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12699886854049647, + "kl": 0.0333251953125, + "learning_rate": 1.2780376126072154e-07, + "loss": 0.0773, + "num_tokens": 2298379569.0, + "reward": 2.6607143878936768, + "reward_std": 0.4718056619167328, + "rewards/accuracy_reward/mean": 0.765625, + "rewards/accuracy_reward/std": 0.42408111691474915, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12399725615978241, + "step": 4219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1017.4085083007812, + "completions/mean_terminated_length": 769.0387573242188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8992594960311118, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1489353905178799, + "kl": 0.029541015625, + "learning_rate": 1.2768803032990128e-07, + "loss": 0.0937, + "num_tokens": 2298903352.0, + "reward": 2.41015625, + "reward_std": 0.4729800820350647, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.17231273651123047, + "step": 4220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1004.57373046875, + "completions/mean_terminated_length": 827.4908447265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8994725906984178, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.138638854902198, + "kl": 0.029449462890625, + "learning_rate": 1.27572533116431e-07, + "loss": 0.0541, + "num_tokens": 2299436633.0, + "reward": 2.4140625, + "reward_std": 0.47697728872299194, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15063102543354034, + "step": 4221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 956.1785888671875, + "completions/mean_terminated_length": 784.0827026367188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8996856853657237, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1303104478676641, + "kl": 0.02838134765625, + "learning_rate": 1.2745726968422956e-07, + "loss": 0.0699, + "num_tokens": 2299927321.0, + "reward": 2.6294643878936768, + "reward_std": 0.4414198398590088, + "rewards/accuracy_reward/mean": 0.6986607313156128, + "rewards/accuracy_reward/std": 0.4593527019023895, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12672585248947144, + "step": 4222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1093.5, + "completions/mean_terminated_length": 826.239990234375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8998987800330297, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11705117625360466, + "kl": 0.025665283203125, + "learning_rate": 1.2734224009708657e-07, + "loss": 0.0766, + "num_tokens": 2300492041.0, + "reward": 2.30078125, + "reward_std": 0.44247496128082275, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1579248607158661, + "step": 4223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 991.7701416015625, + "completions/mean_terminated_length": 825.2842407226562, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.9001118747003356, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12788168234712893, + "kl": 0.026031494140625, + "learning_rate": 1.2722744441866184e-07, + "loss": 0.0829, + "num_tokens": 2301008386.0, + "reward": 2.4681921005249023, + "reward_std": 0.3792712390422821, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12359262257814407, + "step": 4224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 957.60498046875, + "completions/mean_terminated_length": 720.5625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9003249693676416, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1394535427903369, + "kl": 0.030242919921875, + "learning_rate": 1.2711288271248621e-07, + "loss": 0.1288, + "num_tokens": 2301505361.0, + "reward": 2.388392925262451, + "reward_std": 0.4781411290168762, + "rewards/accuracy_reward/mean": 0.5115740895271301, + "rewards/accuracy_reward/std": 0.5004456043243408, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407156348228455, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15274205803871155, + "step": 4225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1188.97998046875, + "completions/mean_terminated_length": 874.7042236328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9005380640349475, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11048458935527253, + "kl": 0.0238037109375, + "learning_rate": 1.2699855504196075e-07, + "loss": 0.0613, + "num_tokens": 2302103448.0, + "reward": 2.3699777126312256, + "reward_std": 0.4487333595752716, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16516588628292084, + "step": 4226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1006.6495971679688, + "completions/mean_terminated_length": 810.5331420898438, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.9007511587022535, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14691857776548145, + "kl": 0.029876708984375, + "learning_rate": 1.268844614703571e-07, + "loss": 0.0902, + "num_tokens": 2302619339.0, + "reward": 2.532924175262451, + "reward_std": 0.4915497601032257, + "rewards/accuracy_reward/mean": 0.6428571343421936, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.13675068318843842, + "step": 4227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 908.904052734375, + "completions/mean_terminated_length": 759.3257446289062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9009642533695594, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.2034812647119121, + "kl": 0.03277587890625, + "learning_rate": 1.2677060206081726e-07, + "loss": 0.0617, + "num_tokens": 2303088880.0, + "reward": 2.423549175262451, + "reward_std": 0.34090951085090637, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11702418327331543, + "step": 4228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 861.5826416015625, + "completions/mean_terminated_length": 698.9771728515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9011773480368653, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13046261929793862, + "kl": 0.033233642578125, + "learning_rate": 1.2665697687635375e-07, + "loss": 0.0445, + "num_tokens": 2303538789.0, + "reward": 2.494419813156128, + "reward_std": 0.3952323794364929, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.12485509365797043, + "step": 4229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 881.5803833007812, + "completions/mean_terminated_length": 711.5396118164062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.9013904427041713, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12463176794736495, + "kl": 0.028564453125, + "learning_rate": 1.2654358597984938e-07, + "loss": 0.0381, + "num_tokens": 2304001001.0, + "reward": 2.556919813156128, + "reward_std": 0.34749072790145874, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.4852356016635895, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12062390148639679, + "step": 4230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1037.357177734375, + "completions/mean_terminated_length": 824.302734375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9016035373714772, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1265952066009801, + "kl": 0.026611328125, + "learning_rate": 1.2643042943405734e-07, + "loss": 0.089, + "num_tokens": 2304534969.0, + "reward": 2.517857313156128, + "reward_std": 0.44530680775642395, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1373893767595291, + "step": 4231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1054.921875, + "completions/mean_terminated_length": 815.5927734375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9018166320387833, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1318053868713785, + "kl": 0.02850341796875, + "learning_rate": 1.2631750730160113e-07, + "loss": 0.1109, + "num_tokens": 2305071254.0, + "reward": 2.43359375, + "reward_std": 0.4442411959171295, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14294590055942535, + "step": 4232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 877.4219360351562, + "completions/mean_terminated_length": 733.6666870117188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9020297267060892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1190393386220355, + "kl": 0.031097412109375, + "learning_rate": 1.262048196449745e-07, + "loss": 0.0448, + "num_tokens": 2305537315.0, + "reward": 2.4715402126312256, + "reward_std": 0.32439255714416504, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.09811913222074509, + "step": 4233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1020.6183471679688, + "completions/mean_terminated_length": 773.0221557617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9022428213733952, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13664299771269292, + "kl": 0.025970458984375, + "learning_rate": 1.2609236652654143e-07, + "loss": 0.1146, + "num_tokens": 2306062968.0, + "reward": 2.4481027126312256, + "reward_std": 0.45905932784080505, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.15174883604049683, + "step": 4234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 858.1741333007812, + "completions/mean_terminated_length": 763.5614624023438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.9024559160407011, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1271575627244067, + "kl": 0.03289794921875, + "learning_rate": 1.2598014800853616e-07, + "loss": 0.0141, + "num_tokens": 2306518950.0, + "reward": 2.4994421005249023, + "reward_std": 0.4050199091434479, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1372518688440323, + "step": 4235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 935.5647583007812, + "completions/mean_terminated_length": 715.4572143554688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.902669010708007, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14287386451259723, + "kl": 0.028564453125, + "learning_rate": 1.2586816415306294e-07, + "loss": 0.0953, + "num_tokens": 2307005299.0, + "reward": 2.427455425262451, + "reward_std": 0.448410302400589, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396106719971, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13661938905715942, + "step": 4236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 974.5000610351562, + "completions/mean_terminated_length": 762.0962524414062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.902882105375313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1327412533063871, + "kl": 0.027435302734375, + "learning_rate": 1.2575641502209642e-07, + "loss": 0.0703, + "num_tokens": 2307507363.0, + "reward": 2.404576063156128, + "reward_std": 0.4565046727657318, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15764793753623962, + "step": 4237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 907.107177734375, + "completions/mean_terminated_length": 740.7877197265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9030952000426189, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14755042340416133, + "kl": 0.0318603515625, + "learning_rate": 1.2564490067748103e-07, + "loss": 0.1489, + "num_tokens": 2307980611.0, + "reward": 2.431919813156128, + "reward_std": 0.5226327180862427, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.1676388382911682, + "step": 4238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 905.169677734375, + "completions/mean_terminated_length": 748.5380859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9033082947099249, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1343186060310149, + "kl": 0.02880859375, + "learning_rate": 1.255336211809316e-07, + "loss": 0.0577, + "num_tokens": 2308454927.0, + "reward": 2.4213171005249023, + "reward_std": 0.41429373621940613, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.11371295899152756, + "step": 4239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1082.305908203125, + "completions/mean_terminated_length": 832.744384765625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9035213893772308, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.3314695454139338, + "kl": 0.02947998046875, + "learning_rate": 1.2542257659403267e-07, + "loss": 0.0599, + "num_tokens": 2309013784.0, + "reward": 2.4140625, + "reward_std": 0.4254540801048279, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1286761611700058, + "step": 4240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1055.368408203125, + "completions/mean_terminated_length": 849.3504028320312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9037344840445368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14433420910004904, + "kl": 0.028289794921875, + "learning_rate": 1.2531176697823886e-07, + "loss": 0.0826, + "num_tokens": 2309554301.0, + "reward": 2.3130581378936768, + "reward_std": 0.5043104887008667, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14921022951602936, + "step": 4241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1008.185302734375, + "completions/mean_terminated_length": 771.7342529296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9039475787118427, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13982994218370837, + "kl": 0.0263671875, + "learning_rate": 1.252011923948751e-07, + "loss": 0.111, + "num_tokens": 2310075376.0, + "reward": 2.4107143878936768, + "reward_std": 0.4051538109779358, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.16231535375118256, + "step": 4242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 982.74560546875, + "completions/mean_terminated_length": 795.4172973632812, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9041606733791487, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11036421730209296, + "kl": 0.0260009765625, + "learning_rate": 1.2509085290513563e-07, + "loss": 0.048, + "num_tokens": 2310584558.0, + "reward": 2.4347100257873535, + "reward_std": 0.3274381756782532, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10962118953466415, + "step": 4243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1092.8482666015625, + "completions/mean_terminated_length": 846.01123046875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.9043737680464546, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11674024338207285, + "kl": 0.02490234375, + "learning_rate": 1.249807485700851e-07, + "loss": 0.0632, + "num_tokens": 2311154058.0, + "reward": 2.38671875, + "reward_std": 0.3895522654056549, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13748814165592194, + "step": 4244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 939.6495971679688, + "completions/mean_terminated_length": 771.5449829101562, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9045868627137605, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13673097248726682, + "kl": 0.029754638671875, + "learning_rate": 1.248708794506578e-07, + "loss": 0.0699, + "num_tokens": 2311642765.0, + "reward": 2.421875, + "reward_std": 0.3794919550418854, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09024729579687119, + "step": 4245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 959.8839721679688, + "completions/mean_terminated_length": 781.8285522460938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9047999573810666, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14706659730544427, + "kl": 0.029052734375, + "learning_rate": 1.2476124560765787e-07, + "loss": 0.1089, + "num_tokens": 2312143897.0, + "reward": 2.4955358505249023, + "reward_std": 0.5117605328559875, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835427761078, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.167940154671669, + "step": 4246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1013.04248046875, + "completions/mean_terminated_length": 798.2398681640625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9050130520483725, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11162120965158873, + "kl": 0.02569580078125, + "learning_rate": 1.2465184710175923e-07, + "loss": 0.0904, + "num_tokens": 2312660572.0, + "reward": 2.521205425262451, + "reward_std": 0.3844025731086731, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.216333270072937, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14505796134471893, + "step": 4247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1138.9598388671875, + "completions/mean_terminated_length": 904.039306640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9052261467156785, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.10849229028488235, + "kl": 0.022613525390625, + "learning_rate": 1.2454268399350553e-07, + "loss": 0.0655, + "num_tokens": 2313240586.0, + "reward": 2.330357313156128, + "reward_std": 0.44210320711135864, + "rewards/accuracy_reward/mean": 0.4652777910232544, + "rewards/accuracy_reward/std": 0.499371200799942, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14187753200531006, + "step": 4248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 953.279052734375, + "completions/mean_terminated_length": 796.8903198242188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9054392413829844, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12638388795980343, + "kl": 0.026763916015625, + "learning_rate": 1.2443375634331032e-07, + "loss": 0.0769, + "num_tokens": 2313739751.0, + "reward": 2.490513563156128, + "reward_std": 0.4028940498828888, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13622933626174927, + "step": 4249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 920.4531860351562, + "completions/mean_terminated_length": 718.6815795898438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9056523360502904, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13766801273237217, + "kl": 0.03131103515625, + "learning_rate": 1.2432506421145674e-07, + "loss": 0.0819, + "num_tokens": 2314216978.0, + "reward": 2.548549175262451, + "reward_std": 0.44123563170433044, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764313220977783, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12315750867128372, + "step": 4250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1052.3460693359375, + "completions/mean_terminated_length": 839.184326171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9058654307175963, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11674170474022959, + "kl": 0.024322509765625, + "learning_rate": 1.2421660765809736e-07, + "loss": 0.0246, + "num_tokens": 2314755357.0, + "reward": 2.3565850257873535, + "reward_std": 0.3697391450405121, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12359261512756348, + "step": 4251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1059.435302734375, + "completions/mean_terminated_length": 768.0086669921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9060785253849022, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11981847439064472, + "kl": 0.025146484375, + "learning_rate": 1.2410838674325472e-07, + "loss": 0.0398, + "num_tokens": 2315309600.0, + "reward": 2.3392858505249023, + "reward_std": 0.40727052092552185, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12782442569732666, + "step": 4252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 875.1830444335938, + "completions/mean_terminated_length": 769.6009521484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9062916200522082, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14090586009353856, + "kl": 0.031219482421875, + "learning_rate": 1.2400040152682085e-07, + "loss": 0.1263, + "num_tokens": 2315763826.0, + "reward": 2.5541296005249023, + "reward_std": 0.38873523473739624, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835129737854, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11440251022577286, + "step": 4253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 923.0156860351562, + "completions/mean_terminated_length": 775.2904052734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9065047147195141, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12553222374701267, + "kl": 0.032135009765625, + "learning_rate": 1.238926520685572e-07, + "loss": 0.0282, + "num_tokens": 2316244265.0, + "reward": 2.5708706378936768, + "reward_std": 0.4033043086528778, + "rewards/accuracy_reward/mean": 0.6741071343421936, + "rewards/accuracy_reward/std": 0.4692314565181732, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13311463594436646, + "step": 4254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 891.4375610351562, + "completions/mean_terminated_length": 698.6771240234375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9067178093868201, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13926558369163775, + "kl": 0.0289306640625, + "learning_rate": 1.2378513842809484e-07, + "loss": 0.0684, + "num_tokens": 2316710173.0, + "reward": 2.4994421005249023, + "reward_std": 0.3587874174118042, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12201692909002304, + "step": 4255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1053.6785888671875, + "completions/mean_terminated_length": 810.6222534179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.906930904054126, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13950988882714518, + "kl": 0.027984619140625, + "learning_rate": 1.2367786066493447e-07, + "loss": 0.0793, + "num_tokens": 2317259133.0, + "reward": 2.2924108505249023, + "reward_std": 0.49396976828575134, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.16516682505607605, + "step": 4256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1084.0, + "completions/mean_terminated_length": 861.5385131835938, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.907143998721432, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12308345998433272, + "kl": 0.023468017578125, + "learning_rate": 1.235708188384461e-07, + "loss": 0.0848, + "num_tokens": 2317818477.0, + "reward": 2.392299175262451, + "reward_std": 0.45165103673934937, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13519908487796783, + "step": 4257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1014.6808471679688, + "completions/mean_terminated_length": 826.5567626953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9073570933887379, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12103463382609936, + "kl": 0.027069091796875, + "learning_rate": 1.2346401300786898e-07, + "loss": 0.0657, + "num_tokens": 2318347150.0, + "reward": 2.4765625, + "reward_std": 0.4353918135166168, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1259699910879135, + "step": 4258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 890.5803833007812, + "completions/mean_terminated_length": 735.281005859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.907570188056044, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12506846704969493, + "kl": 0.029296875, + "learning_rate": 1.2335744323231218e-07, + "loss": 0.0792, + "num_tokens": 2318811570.0, + "reward": 2.5385046005249023, + "reward_std": 0.39858993887901306, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.11946304887533188, + "step": 4259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 921.0245971679688, + "completions/mean_terminated_length": 760.0280151367188, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.9077832827233498, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15408457322376723, + "kl": 0.029937744140625, + "learning_rate": 1.2325110957075382e-07, + "loss": 0.0828, + "num_tokens": 2319288493.0, + "reward": 2.5111608505249023, + "reward_std": 0.449688583612442, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10152243822813034, + "step": 4260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1021.9576416015625, + "completions/mean_terminated_length": 825.4813842773438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9079963773906558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14398715293030606, + "kl": 0.026763916015625, + "learning_rate": 1.2314501208204163e-07, + "loss": 0.0568, + "num_tokens": 2319809098.0, + "reward": 2.3822546005249023, + "reward_std": 0.48828285932540894, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15371057391166687, + "step": 4261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1070.4576416015625, + "completions/mean_terminated_length": 844.8709106445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9082094720579618, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12148863810616908, + "kl": 0.025634765625, + "learning_rate": 1.2303915082489212e-07, + "loss": 0.0747, + "num_tokens": 2320353831.0, + "reward": 2.3950893878936768, + "reward_std": 0.4405044913291931, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14088857173919678, + "step": 4262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 961.560302734375, + "completions/mean_terminated_length": 767.144775390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9084225667252677, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14721812106006513, + "kl": 0.031494140625, + "learning_rate": 1.229335258578916e-07, + "loss": 0.0856, + "num_tokens": 2320859458.0, + "reward": 2.4447546005249023, + "reward_std": 0.4358256459236145, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1240563914179802, + "step": 4263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1003.2366333007812, + "completions/mean_terminated_length": 786.39892578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9086356613925737, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13401333322048226, + "kl": 0.030853271484375, + "learning_rate": 1.2282813723949535e-07, + "loss": 0.0549, + "num_tokens": 2321377452.0, + "reward": 2.3973214626312256, + "reward_std": 0.44694262742996216, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.15583296120166779, + "step": 4264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1005.6785888671875, + "completions/mean_terminated_length": 732.6196899414062, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9088487560598796, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14578539442791744, + "kl": 0.02801513671875, + "learning_rate": 1.2272298502802798e-07, + "loss": 0.092, + "num_tokens": 2321897644.0, + "reward": 2.4190850257873535, + "reward_std": 0.3997432291507721, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1247187927365303, + "step": 4265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 929.2344360351562, + "completions/mean_terminated_length": 746.1636352539062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9090618507271856, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14716167278936598, + "kl": 0.029632568359375, + "learning_rate": 1.226180692816831e-07, + "loss": 0.0859, + "num_tokens": 2322389525.0, + "reward": 2.3353796005249023, + "reward_std": 0.4462164640426636, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.14720545709133148, + "step": 4266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1090.3348388671875, + "completions/mean_terminated_length": 818.67626953125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9092749453944915, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11840403755799288, + "kl": 0.02435302734375, + "learning_rate": 1.2251339005852348e-07, + "loss": 0.0576, + "num_tokens": 2322957995.0, + "reward": 2.3387277126312256, + "reward_std": 0.41767117381095886, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14566238224506378, + "step": 4267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 900.7433471679688, + "completions/mean_terminated_length": 706.0391845703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9094880400617974, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15020398941292937, + "kl": 0.03167724609375, + "learning_rate": 1.2240894741648132e-07, + "loss": 0.0931, + "num_tokens": 2323427928.0, + "reward": 2.4988839626312256, + "reward_std": 0.4546898305416107, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.14203143119812012, + "step": 4268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1023.6183471679688, + "completions/mean_terminated_length": 843.4776611328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9097011347291034, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11696688299387133, + "kl": 0.026123046875, + "learning_rate": 1.223047414133574e-07, + "loss": 0.0577, + "num_tokens": 2323957469.0, + "reward": 2.4503350257873535, + "reward_std": 0.434961199760437, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12605296075344086, + "step": 4269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1018.138427734375, + "completions/mean_terminated_length": 837.0341186523438, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9099142293964093, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12419326936492042, + "kl": 0.027252197265625, + "learning_rate": 1.2220077210682178e-07, + "loss": 0.0757, + "num_tokens": 2324485291.0, + "reward": 2.4810268878936768, + "reward_std": 0.40673476457595825, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.13915444910526276, + "step": 4270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 962.6964721679688, + "completions/mean_terminated_length": 778.5065307617188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9101273240637153, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12468537102183236, + "kl": 0.0291748046875, + "learning_rate": 1.2209703955441358e-07, + "loss": 0.0504, + "num_tokens": 2324980083.0, + "reward": 2.4302456378936768, + "reward_std": 0.4462333023548126, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14801737666130066, + "step": 4271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 961.450927734375, + "completions/mean_terminated_length": 728.8292846679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9103404187310212, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11582106158470883, + "kl": 0.0269775390625, + "learning_rate": 1.219935438135408e-07, + "loss": 0.0902, + "num_tokens": 2325472813.0, + "reward": 2.4291296005249023, + "reward_std": 0.408407986164093, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11919104307889938, + "step": 4272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 747.0469360351562, + "completions/mean_terminated_length": 622.9951171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9105535133983272, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14938501318552166, + "kl": 0.0360107421875, + "learning_rate": 1.2189028494148044e-07, + "loss": 0.0598, + "num_tokens": 2325870082.0, + "reward": 2.4810268878936768, + "reward_std": 0.3640085756778717, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.0999298095703125, + "step": 4273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 923.5201416015625, + "completions/mean_terminated_length": 729.2382202148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9107666080656331, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1605242157095266, + "kl": 0.03106689453125, + "learning_rate": 1.2178726299537836e-07, + "loss": 0.085, + "num_tokens": 2326358123.0, + "reward": 2.4765625, + "reward_std": 0.43555253744125366, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1314026117324829, + "step": 4274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 950.68310546875, + "completions/mean_terminated_length": 797.114501953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9109797027329392, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12827012684405112, + "kl": 0.029083251953125, + "learning_rate": 1.2168447803224925e-07, + "loss": 0.0508, + "num_tokens": 2326850141.0, + "reward": 2.4715402126312256, + "reward_std": 0.3935849368572235, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10092892497777939, + "step": 4275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1135.4129638671875, + "completions/mean_terminated_length": 856.0496215820312, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9111927974002451, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14282375204998463, + "kl": 0.024200439453125, + "learning_rate": 1.2158193010897695e-07, + "loss": 0.1334, + "num_tokens": 2327431046.0, + "reward": 2.3404018878936768, + "reward_std": 0.4842710793018341, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14114972949028015, + "step": 4276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1236.5, + "completions/mean_terminated_length": 879.0225219726562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.911405892067551, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11805929002768534, + "kl": 0.022003173828125, + "learning_rate": 1.2147961928231356e-07, + "loss": 0.0893, + "num_tokens": 2328058022.0, + "reward": 2.3231027126312256, + "reward_std": 0.41856852173805237, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12428762763738632, + "step": 4277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1113.321533203125, + "completions/mean_terminated_length": 767.4617919921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.911618986734857, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13612824629957904, + "kl": 0.028472900390625, + "learning_rate": 1.2137754560888054e-07, + "loss": 0.0559, + "num_tokens": 2328625654.0, + "reward": 2.3621652126312256, + "reward_std": 0.431217223405838, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.16706722974777222, + "step": 4278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1051.841552734375, + "completions/mean_terminated_length": 790.8760375976562, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9118320814021629, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13698096561857806, + "kl": 0.025848388671875, + "learning_rate": 1.2127570914516777e-07, + "loss": 0.089, + "num_tokens": 2329174783.0, + "reward": 2.3309152126312256, + "reward_std": 0.41191932559013367, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15693362057209015, + "step": 4279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 897.4910888671875, + "completions/mean_terminated_length": 726.3897705078125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9120451760694689, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14422054934777795, + "kl": 0.03009033203125, + "learning_rate": 1.2117410994753396e-07, + "loss": 0.0906, + "num_tokens": 2329647339.0, + "reward": 2.4564733505249023, + "reward_std": 0.3915725350379944, + "rewards/accuracy_reward/mean": 0.5671296119689941, + "rewards/accuracy_reward/std": 0.4960475564002991, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14015565812587738, + "step": 4280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 916.747802734375, + "completions/mean_terminated_length": 735.0440063476562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9122582707367748, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1252950433684357, + "kl": 0.029754638671875, + "learning_rate": 1.2107274807220644e-07, + "loss": 0.0611, + "num_tokens": 2330133578.0, + "reward": 2.490513563156128, + "reward_std": 0.3581659495830536, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.09647680073976517, + "step": 4281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 939.5379638671875, + "completions/mean_terminated_length": 764.8191528320312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9124713654040808, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1356447306248167, + "kl": 0.028961181640625, + "learning_rate": 1.2097162357528126e-07, + "loss": 0.1131, + "num_tokens": 2330625659.0, + "reward": 2.3705358505249023, + "reward_std": 0.4865240156650543, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14767222106456757, + "step": 4282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 999.71435546875, + "completions/mean_terminated_length": 754.2479248046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9126844600713867, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11800086440673914, + "kl": 0.028350830078125, + "learning_rate": 1.2087073651272314e-07, + "loss": 0.0557, + "num_tokens": 2331140475.0, + "reward": 2.4324777126312256, + "reward_std": 0.37189552187919617, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12015076726675034, + "step": 4283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1002.6116333007812, + "completions/mean_terminated_length": 818.7769165039062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9128975547386927, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12836968933771142, + "kl": 0.02880859375, + "learning_rate": 1.2077008694036527e-07, + "loss": 0.0518, + "num_tokens": 2331659085.0, + "reward": 2.513951063156128, + "reward_std": 0.44159072637557983, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13588985800743103, + "step": 4284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 1032.227783203125, + "completions/mean_terminated_length": 783.9277954101562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9131106494059986, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13537101938695856, + "kl": 0.026031494140625, + "learning_rate": 1.2066967491390963e-07, + "loss": 0.0671, + "num_tokens": 2332194867.0, + "reward": 2.4375, + "reward_std": 0.38138601183891296, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.14033812284469604, + "step": 4285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1214.69873046875, + "completions/mean_terminated_length": 862.8603515625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9133237440733045, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10336607880925264, + "kl": 0.021759033203125, + "learning_rate": 1.205695004889264e-07, + "loss": 0.0516, + "num_tokens": 2332814188.0, + "reward": 2.17578125, + "reward_std": 0.4046914279460907, + "rewards/accuracy_reward/mean": 0.2790178656578064, + "rewards/accuracy_reward/std": 0.449017733335495, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12605296075344086, + "step": 4286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 992.1339721679688, + "completions/mean_terminated_length": 789.94677734375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.9135368387406105, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.8502097133076573, + "kl": 0.030029296875, + "learning_rate": 1.204695637208546e-07, + "loss": 0.0794, + "num_tokens": 2333326328.0, + "reward": 2.392857313156128, + "reward_std": 0.47749075293540955, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9598214030265808, + "rewards/tag_count_reward/std": 0.1605832874774933, + "step": 4287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 936.2277221679688, + "completions/mean_terminated_length": 757.65283203125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9137499334079164, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12488189365979543, + "kl": 0.028228759765625, + "learning_rate": 1.203698646650015e-07, + "loss": 0.0882, + "num_tokens": 2333818062.0, + "reward": 2.4135046005249023, + "reward_std": 0.4057147800922394, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10190140455961227, + "step": 4288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 950.4732666015625, + "completions/mean_terminated_length": 780.7525634765625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9139630280752225, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12598054630434816, + "kl": 0.02813720703125, + "learning_rate": 1.2027040337654287e-07, + "loss": 0.0384, + "num_tokens": 2334311762.0, + "reward": 2.3582589626312256, + "reward_std": 0.3835171163082123, + "rewards/accuracy_reward/mean": 0.48148149251937866, + "rewards/accuracy_reward/std": 0.5002362728118896, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11389531940221786, + "step": 4289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1036.196533203125, + "completions/mean_terminated_length": 763.8980102539062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9141761227425284, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13386836270126953, + "kl": 0.026885986328125, + "learning_rate": 1.2017117991052303e-07, + "loss": 0.0685, + "num_tokens": 2334853018.0, + "reward": 2.40234375, + "reward_std": 0.42113855481147766, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13124457001686096, + "step": 4290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 1098.32373046875, + "completions/mean_terminated_length": 846.1497192382812, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9143892174098344, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12084877075261727, + "kl": 0.022552490234375, + "learning_rate": 1.2007219432185455e-07, + "loss": 0.0689, + "num_tokens": 2335415099.0, + "reward": 2.415736675262451, + "reward_std": 0.4345530867576599, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11801212280988693, + "step": 4291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1020.497802734375, + "completions/mean_terminated_length": 803.8892211914062, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9146023120771403, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12828342832865927, + "kl": 0.026824951171875, + "learning_rate": 1.1997344666531832e-07, + "loss": 0.0489, + "num_tokens": 2335944458.0, + "reward": 2.4832589626312256, + "reward_std": 0.4384419023990631, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09158609062433243, + "step": 4292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1045.7098388671875, + "completions/mean_terminated_length": 807.5967407226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9148154067444462, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1556944719675982, + "kl": 0.030609130859375, + "learning_rate": 1.1987493699556365e-07, + "loss": 0.0506, + "num_tokens": 2336490952.0, + "reward": 2.4453125, + "reward_std": 0.39094749093055725, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12016765773296356, + "step": 4293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 978.7366333007812, + "completions/mean_terminated_length": 784.068603515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9150285014117522, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.39794369173073857, + "kl": 0.028778076171875, + "learning_rate": 1.1977666536710803e-07, + "loss": 0.112, + "num_tokens": 2337007042.0, + "reward": 2.37890625, + "reward_std": 0.402964323759079, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1446475237607956, + "step": 4294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 856.1986694335938, + "completions/mean_terminated_length": 719.8233642578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9152415960790581, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12127234165709, + "kl": 0.03399658203125, + "learning_rate": 1.196786318343374e-07, + "loss": 0.051, + "num_tokens": 2337448267.0, + "reward": 2.687500238418579, + "reward_std": 0.35360977053642273, + "rewards/accuracy_reward/mean": 0.7611607313156128, + "rewards/accuracy_reward/std": 0.4268510043621063, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10152244567871094, + "step": 4295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1016.2366333007812, + "completions/mean_terminated_length": 778.1373901367188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9154546907463641, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1289341021294902, + "kl": 0.027374267578125, + "learning_rate": 1.1958083645150568e-07, + "loss": 0.1109, + "num_tokens": 2337977125.0, + "reward": 2.3671875, + "reward_std": 0.5158634185791016, + "rewards/accuracy_reward/mean": 0.4709821343421936, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15520283579826355, + "step": 4296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1020.2522583007812, + "completions/mean_terminated_length": 765.46240234375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.91566778541367, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14099058378325344, + "kl": 0.0260009765625, + "learning_rate": 1.1948327927273528e-07, + "loss": 0.0911, + "num_tokens": 2338503910.0, + "reward": 2.3861608505249023, + "reward_std": 0.47436875104904175, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1540280133485794, + "step": 4297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1019.4777221679688, + "completions/mean_terminated_length": 731.4913940429688, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.915880880080976, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1357149115479806, + "kl": 0.0299072265625, + "learning_rate": 1.1938596035201656e-07, + "loss": 0.0569, + "num_tokens": 2339022428.0, + "reward": 2.4559152126312256, + "reward_std": 0.4842139184474945, + "rewards/accuracy_reward/mean": 0.5892857313156128, + "rewards/accuracy_reward/std": 0.4925134479999542, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16430194675922394, + "step": 4298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 923.6027221679688, + "completions/mean_terminated_length": 708.2925415039062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9160939747482819, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.16919233253088392, + "kl": 0.030181884765625, + "learning_rate": 1.1928887974320806e-07, + "loss": 0.1233, + "num_tokens": 2339499194.0, + "reward": 2.40625, + "reward_std": 0.38667890429496765, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11036036163568497, + "step": 4299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 863.0781860351562, + "completions/mean_terminated_length": 717.5614013671875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9163070694155879, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1450599614166613, + "kl": 0.0333251953125, + "learning_rate": 1.191920375000365e-07, + "loss": 0.0441, + "num_tokens": 2339952333.0, + "reward": 2.5965402126312256, + "reward_std": 0.3840074837207794, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.470055490732193, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9893973469734192, + "rewards/tag_count_reward/std": 0.07496661692857742, + "step": 4300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 915.529052734375, + "completions/mean_terminated_length": 733.6295166015625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9165201640828938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14443095441150788, + "kl": 0.031646728515625, + "learning_rate": 1.1909543367609663e-07, + "loss": 0.0902, + "num_tokens": 2340428650.0, + "reward": 2.4268975257873535, + "reward_std": 0.4631510078907013, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.12691166996955872, + "step": 4301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 979.5313110351562, + "completions/mean_terminated_length": 736.5643920898438, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9167332587501997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5350156622927484, + "kl": 0.034820556640625, + "learning_rate": 1.189990683248513e-07, + "loss": 0.0684, + "num_tokens": 2340949384.0, + "reward": 2.4185268878936768, + "reward_std": 0.4733780026435852, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13033421337604523, + "step": 4302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 884.7031860351562, + "completions/mean_terminated_length": 676.5342407226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9169463534175057, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14434482262669748, + "kl": 0.03131103515625, + "learning_rate": 1.1890294149963134e-07, + "loss": 0.1253, + "num_tokens": 2341405971.0, + "reward": 2.455357313156128, + "reward_std": 0.4411734342575073, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1378791630268097, + "step": 4303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1057.716552734375, + "completions/mean_terminated_length": 825.8319702148438, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.9171594480848116, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11460955396170243, + "kl": 0.025726318359375, + "learning_rate": 1.188070532536356e-07, + "loss": 0.0574, + "num_tokens": 2341957316.0, + "reward": 2.435826063156128, + "reward_std": 0.36691829562187195, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.09911917895078659, + "step": 4304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1074.3504638671875, + "completions/mean_terminated_length": 798.1575927734375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.9173725427521177, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1297835240496017, + "kl": 0.024688720703125, + "learning_rate": 1.187114036399309e-07, + "loss": 0.0679, + "num_tokens": 2342507281.0, + "reward": 2.3950893878936768, + "reward_std": 0.38043105602264404, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12270178645849228, + "step": 4305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1156.8504638671875, + "completions/mean_terminated_length": 877.222900390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9175856374194236, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11543840863198242, + "kl": 0.023590087890625, + "learning_rate": 1.1861599271145194e-07, + "loss": 0.0584, + "num_tokens": 2343091406.0, + "reward": 2.2427456378936768, + "reward_std": 0.4806744456291199, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16570167243480682, + "step": 4306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 999.232177734375, + "completions/mean_terminated_length": 811.5579223632812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9177987320867296, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13559579595572716, + "kl": 0.026763916015625, + "learning_rate": 1.1852082052100142e-07, + "loss": 0.0722, + "num_tokens": 2343613174.0, + "reward": 2.37890625, + "reward_std": 0.4389708638191223, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13519908487796783, + "step": 4307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 942.1250610351562, + "completions/mean_terminated_length": 723.3155517578125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9180118267540355, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1336596778119635, + "kl": 0.02728271484375, + "learning_rate": 1.184258871212497e-07, + "loss": 0.0772, + "num_tokens": 2344111438.0, + "reward": 2.4815850257873535, + "reward_std": 0.4398045539855957, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14372976124286652, + "step": 4308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1065.3035888671875, + "completions/mean_terminated_length": 797.2954711914062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9182249214213414, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11788909961402896, + "kl": 0.025848388671875, + "learning_rate": 1.1833119256473539e-07, + "loss": 0.0559, + "num_tokens": 2344662374.0, + "reward": 2.415736675262451, + "reward_std": 0.3968111276626587, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10550089925527573, + "step": 4309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1028.203125, + "completions/mean_terminated_length": 836.1458740234375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9184380160886474, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13206484514646272, + "kl": 0.025390625, + "learning_rate": 1.182367369038646e-07, + "loss": 0.0682, + "num_tokens": 2345191185.0, + "reward": 2.4224331378936768, + "reward_std": 0.45625853538513184, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390644073486, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.126290425658226, + "step": 4310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 940.638427734375, + "completions/mean_terminated_length": 772.6837768554688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9186511107559533, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1344194588287428, + "kl": 0.029266357421875, + "learning_rate": 1.1814252019091115e-07, + "loss": 0.107, + "num_tokens": 2345686591.0, + "reward": 2.53125, + "reward_std": 0.4150005578994751, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10152245312929153, + "step": 4311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1006.107177734375, + "completions/mean_terminated_length": 755.0138549804688, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.9188642054232593, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12089684824738905, + "kl": 0.0262451171875, + "learning_rate": 1.1804854247801699e-07, + "loss": 0.0439, + "num_tokens": 2346200831.0, + "reward": 2.53515625, + "reward_std": 0.4017380475997925, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10703980922698975, + "step": 4312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 932.0982666015625, + "completions/mean_terminated_length": 782.36962890625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9190773000905652, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12793975353724973, + "kl": 0.027984619140625, + "learning_rate": 1.1795480381719142e-07, + "loss": 0.0586, + "num_tokens": 2346682939.0, + "reward": 2.5184152126312256, + "reward_std": 0.4725482761859894, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1199323758482933, + "step": 4313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1067.712158203125, + "completions/mean_terminated_length": 800.36083984375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9192903947578712, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.11571186625670028, + "kl": 0.023834228515625, + "learning_rate": 1.178613042603118e-07, + "loss": 0.068, + "num_tokens": 2347227914.0, + "reward": 2.4268975257873535, + "reward_std": 0.3438253104686737, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.09953393787145615, + "step": 4314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1040.7879638671875, + "completions/mean_terminated_length": 851.1007690429688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9195034894251771, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.121292778555046, + "kl": 0.024444580078125, + "learning_rate": 1.1776804385912288e-07, + "loss": 0.0488, + "num_tokens": 2347768491.0, + "reward": 2.478236675262451, + "reward_std": 0.41314926743507385, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10940459370613098, + "step": 4315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1045.83935546875, + "completions/mean_terminated_length": 772.5227661132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9197165840924831, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11483484926167137, + "kl": 0.02471923828125, + "learning_rate": 1.1767502266523702e-07, + "loss": 0.0825, + "num_tokens": 2348309075.0, + "reward": 2.458705425262451, + "reward_std": 0.3995319902896881, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.11055814474821091, + "step": 4316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1081.9910888671875, + "completions/mean_terminated_length": 789.9418334960938, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.919929678759789, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11789900994788009, + "kl": 0.025146484375, + "learning_rate": 1.1758224073013455e-07, + "loss": 0.0575, + "num_tokens": 2348872831.0, + "reward": 2.321986675262451, + "reward_std": 0.46032291650772095, + "rewards/accuracy_reward/mean": 0.4464285671710968, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15641570091247559, + "step": 4317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1017.3460083007812, + "completions/mean_terminated_length": 758.2429809570312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9201427734270949, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12007469625415519, + "kl": 0.02655029296875, + "learning_rate": 1.1748969810516305e-07, + "loss": 0.0574, + "num_tokens": 2349402250.0, + "reward": 2.364955425262451, + "reward_std": 0.4424870014190674, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11266100406646729, + "step": 4318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 883.7701416015625, + "completions/mean_terminated_length": 727.5569458007812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.920355868094401, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15157029828532473, + "kl": 0.03057861328125, + "learning_rate": 1.1739739484153782e-07, + "loss": 0.1098, + "num_tokens": 2349879603.0, + "reward": 2.484375, + "reward_std": 0.43522077798843384, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14383503794670105, + "step": 4319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1041.3125, + "completions/mean_terminated_length": 795.2333374023438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9205689627617069, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13355554414722376, + "kl": 0.02606201171875, + "learning_rate": 1.1730533099034159e-07, + "loss": 0.0798, + "num_tokens": 2350412431.0, + "reward": 2.4129464626312256, + "reward_std": 0.39991295337677, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.16516682505607605, + "step": 4320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 999.982177734375, + "completions/mean_terminated_length": 789.2546997070312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9207820574290129, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15804772731671182, + "kl": 0.027496337890625, + "learning_rate": 1.1721350660252484e-07, + "loss": 0.0888, + "num_tokens": 2350929383.0, + "reward": 2.3705358505249023, + "reward_std": 0.49632328748703003, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.1139446347951889, + "step": 4321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1178.1585693359375, + "completions/mean_terminated_length": 863.5349731445312, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9209951520963188, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11561173890677831, + "kl": 0.025634765625, + "learning_rate": 1.171219217289051e-07, + "loss": 0.0705, + "num_tokens": 2351536462.0, + "reward": 2.1919643878936768, + "reward_std": 0.4486224949359894, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47548985481262207, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387791991233826, + "rewards/tag_count_reward/mean": 0.9508928656578064, + "rewards/tag_count_reward/std": 0.17882691323757172, + "step": 4322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1071.388427734375, + "completions/mean_terminated_length": 808.5609130859375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9212082467636248, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11506240459693362, + "kl": 0.024932861328125, + "learning_rate": 1.170305764201677e-07, + "loss": 0.0467, + "num_tokens": 2352084636.0, + "reward": 2.396205425262451, + "reward_std": 0.4571070969104767, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1662929803133011, + "step": 4323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1017.8214721679688, + "completions/mean_terminated_length": 823.8090209960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9214213414309307, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14698407357256527, + "kl": 0.027099609375, + "learning_rate": 1.1693947072686525e-07, + "loss": 0.1331, + "num_tokens": 2352605356.0, + "reward": 2.4854912757873535, + "reward_std": 0.5262283682823181, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16962288320064545, + "step": 4324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1063.357177734375, + "completions/mean_terminated_length": 812.3697509765625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.9216344360982366, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11957730266521888, + "kl": 0.02593994140625, + "learning_rate": 1.1684860469941785e-07, + "loss": 0.0543, + "num_tokens": 2353149308.0, + "reward": 2.4441964626312256, + "reward_std": 0.387107789516449, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.1136813834309578, + "step": 4325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 935.2053833007812, + "completions/mean_terminated_length": 742.9424438476562, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9218475307655426, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12937431877539943, + "kl": 0.029693603515625, + "learning_rate": 1.167579783881128e-07, + "loss": 0.0381, + "num_tokens": 2353638248.0, + "reward": 2.4614956378936768, + "reward_std": 0.3544231057167053, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11338312178850174, + "step": 4326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 943.71435546875, + "completions/mean_terminated_length": 728.7466430664062, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9220606254328485, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12490186245087269, + "kl": 0.02740478515625, + "learning_rate": 1.1666759184310484e-07, + "loss": 0.0873, + "num_tokens": 2354121480.0, + "reward": 2.50390625, + "reward_std": 0.331051766872406, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13311463594436646, + "step": 4327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1001.107177734375, + "completions/mean_terminated_length": 759.5164794921875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9222737201001545, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14375470230631254, + "kl": 0.028717041015625, + "learning_rate": 1.1657744511441606e-07, + "loss": 0.0886, + "num_tokens": 2354643128.0, + "reward": 2.4363839626312256, + "reward_std": 0.4349292516708374, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1421368569135666, + "step": 4328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1097.703125, + "completions/mean_terminated_length": 912.7119750976562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9224868147674604, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12177590231414014, + "kl": 0.0252685546875, + "learning_rate": 1.1648753825193577e-07, + "loss": 0.0652, + "num_tokens": 2355204931.0, + "reward": 2.407924175262451, + "reward_std": 0.5403696894645691, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9481026530265808, + "rewards/tag_count_reward/std": 0.17998811602592468, + "step": 4329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 987.4777221679688, + "completions/mean_terminated_length": 739.14599609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9226999094347664, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1244516265434499, + "kl": 0.02679443359375, + "learning_rate": 1.1639787130542042e-07, + "loss": 0.0752, + "num_tokens": 2355718857.0, + "reward": 2.4542412757873535, + "reward_std": 0.38402825593948364, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.49767759442329407, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12016765773296356, + "step": 4330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1038.0, + "completions/mean_terminated_length": 787.6099853515625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9229130041020723, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1287583069659273, + "kl": 0.024749755859375, + "learning_rate": 1.1630844432449396e-07, + "loss": 0.0861, + "num_tokens": 2356253417.0, + "reward": 2.4715402126312256, + "reward_std": 0.4852500259876251, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.17179030179977417, + "step": 4331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1043.4442138671875, + "completions/mean_terminated_length": 804.7928466796875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9231260987693783, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1246779542824781, + "kl": 0.025909423828125, + "learning_rate": 1.1621925735864729e-07, + "loss": 0.0749, + "num_tokens": 2356790528.0, + "reward": 2.459263563156128, + "reward_std": 0.4631551504135132, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.15343420207500458, + "step": 4332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 917.7232666015625, + "completions/mean_terminated_length": 759.5419921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9233391934366842, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12900315545708918, + "kl": 0.02825927734375, + "learning_rate": 1.1613031045723862e-07, + "loss": 0.0738, + "num_tokens": 2357271044.0, + "reward": 2.486607313156128, + "reward_std": 0.31329184770584106, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10517053306102753, + "step": 4333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1012.76123046875, + "completions/mean_terminated_length": 824.2876586914062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9235522881039901, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1429332560598743, + "kl": 0.0281982421875, + "learning_rate": 1.1604160366949318e-07, + "loss": 0.0978, + "num_tokens": 2357792905.0, + "reward": 2.4073662757873535, + "reward_std": 0.37460365891456604, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12758491933345795, + "step": 4334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1106.8035888671875, + "completions/mean_terminated_length": 832.85302734375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9237653827712962, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13513841988564942, + "kl": 0.024566650390625, + "learning_rate": 1.1595313704450339e-07, + "loss": 0.0673, + "num_tokens": 2358358865.0, + "reward": 2.3275671005249023, + "reward_std": 0.3640348017215729, + "rewards/accuracy_reward/mean": 0.4107142984867096, + "rewards/accuracy_reward/std": 0.4925134778022766, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.10996230691671371, + "step": 4335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 978.58935546875, + "completions/mean_terminated_length": 766.9946899414062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.9239784774386021, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16777596381695412, + "kl": 0.029052734375, + "learning_rate": 1.1586491063122883e-07, + "loss": 0.1057, + "num_tokens": 2358866601.0, + "reward": 2.3978796005249023, + "reward_std": 0.4449254870414734, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16232208907604218, + "step": 4336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1036.3013916015625, + "completions/mean_terminated_length": 813.0108642578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9241915721059081, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12252475126418538, + "kl": 0.0250244140625, + "learning_rate": 1.1577692447849605e-07, + "loss": 0.0621, + "num_tokens": 2359397712.0, + "reward": 2.400111675262451, + "reward_std": 0.45364245772361755, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13485698401927948, + "step": 4337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1109.2366943359375, + "completions/mean_terminated_length": 818.2748413085938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.924404666773214, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.10873342068510086, + "kl": 0.022979736328125, + "learning_rate": 1.1568917863499861e-07, + "loss": 0.0388, + "num_tokens": 2359966026.0, + "reward": 2.4285714626312256, + "reward_std": 0.35046347975730896, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10818972438573837, + "step": 4338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1090.4754638671875, + "completions/mean_terminated_length": 782.5988159179688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.92461776144052, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12570502365110353, + "kl": 0.0245361328125, + "learning_rate": 1.1560167314929714e-07, + "loss": 0.1176, + "num_tokens": 2360529823.0, + "reward": 2.4090402126312256, + "reward_std": 0.49671119451522827, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15693362057209015, + "step": 4339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1079.9866943359375, + "completions/mean_terminated_length": 843.3611450195312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9248308561078259, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1336479273820792, + "kl": 0.02447509765625, + "learning_rate": 1.155144080698192e-07, + "loss": 0.0664, + "num_tokens": 2361091177.0, + "reward": 2.310826063156128, + "reward_std": 0.44756436347961426, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14869897067546844, + "step": 4340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 916.6763916015625, + "completions/mean_terminated_length": 745.08740234375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.9250439507751319, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12520072399968074, + "kl": 0.030731201171875, + "learning_rate": 1.1542738344485942e-07, + "loss": 0.0447, + "num_tokens": 2361569544.0, + "reward": 2.447544813156128, + "reward_std": 0.3588140308856964, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14259286224842072, + "step": 4341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 823.8839721679688, + "completions/mean_terminated_length": 690.5643310546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9252570454424378, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.15758728483092396, + "kl": 0.03179931640625, + "learning_rate": 1.1534059932257908e-07, + "loss": 0.0499, + "num_tokens": 2362001636.0, + "reward": 2.6205358505249023, + "reward_std": 0.3683219254016876, + "rewards/accuracy_reward/mean": 0.7175925970077515, + "rewards/accuracy_reward/std": 0.45069241523742676, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.11923423409461975, + "step": 4342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1136.8192138671875, + "completions/mean_terminated_length": 861.345947265625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9254701401097437, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13701561953919866, + "kl": 0.023529052734375, + "learning_rate": 1.1525405575100678e-07, + "loss": 0.0378, + "num_tokens": 2362580803.0, + "reward": 2.424107313156128, + "reward_std": 0.43557584285736084, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1353386640548706, + "step": 4343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 952.43310546875, + "completions/mean_terminated_length": 769.8385620117188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9256832347770497, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12526622000718188, + "kl": 0.02789306640625, + "learning_rate": 1.151677527780376e-07, + "loss": 0.0747, + "num_tokens": 2363079669.0, + "reward": 2.462611675262451, + "reward_std": 0.39819738268852234, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174957811832428, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12109260261058807, + "step": 4344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 932.68310546875, + "completions/mean_terminated_length": 766.8154296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9258963294443556, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13550302359859, + "kl": 0.030517578125, + "learning_rate": 1.1508169045143376e-07, + "loss": 0.0775, + "num_tokens": 2363567431.0, + "reward": 2.4207589626312256, + "reward_std": 0.4839977025985718, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1573437601327896, + "step": 4345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1011.247802734375, + "completions/mean_terminated_length": 746.9776000976562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9261094241116616, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11051203102371328, + "kl": 0.0272216796875, + "learning_rate": 1.1499586881882404e-07, + "loss": 0.0481, + "num_tokens": 2364096358.0, + "reward": 2.4285714626312256, + "reward_std": 0.42416635155677795, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14480386674404144, + "step": 4346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1011.24560546875, + "completions/mean_terminated_length": 768.4793701171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9263225187789675, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12206389706978321, + "kl": 0.027496337890625, + "learning_rate": 1.1491028792770419e-07, + "loss": 0.0301, + "num_tokens": 2364617636.0, + "reward": 2.411830425262451, + "reward_std": 0.4586961269378662, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13351379334926605, + "step": 4347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1064.34375, + "completions/mean_terminated_length": 860.1886596679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9265356134462736, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12648405440577978, + "kl": 0.025299072265625, + "learning_rate": 1.1482494782543672e-07, + "loss": 0.0952, + "num_tokens": 2365165374.0, + "reward": 2.404017925262451, + "reward_std": 0.3988664746284485, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10557335615158081, + "step": 4348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 893.9710083007812, + "completions/mean_terminated_length": 755.4874877929688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9267487081135795, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13711745422304336, + "kl": 0.030181884765625, + "learning_rate": 1.147398485592508e-07, + "loss": 0.1287, + "num_tokens": 2365627249.0, + "reward": 2.5691964626312256, + "reward_std": 0.42172396183013916, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.47764313220977783, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11286582797765732, + "step": 4349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1079.2076416015625, + "completions/mean_terminated_length": 814.9915161132812, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9269618027808854, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14518643508514753, + "kl": 0.0277099609375, + "learning_rate": 1.1465499017624243e-07, + "loss": 0.1067, + "num_tokens": 2366181118.0, + "reward": 2.3286831378936768, + "reward_std": 0.4309108257293701, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.13648569583892822, + "step": 4350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1105.8348388671875, + "completions/mean_terminated_length": 878.7755737304688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9271748974481914, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12323064942671946, + "kl": 0.024017333984375, + "learning_rate": 1.1457037272337414e-07, + "loss": 0.1172, + "num_tokens": 2366750724.0, + "reward": 2.375, + "reward_std": 0.523169755935669, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067809045314789, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.17942628264427185, + "step": 4351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 890.6473388671875, + "completions/mean_terminated_length": 751.7649536132812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9273879921154973, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.136656650325123, + "kl": 0.0301513671875, + "learning_rate": 1.1448599624747523e-07, + "loss": 0.0549, + "num_tokens": 2367215318.0, + "reward": 2.541294813156128, + "reward_std": 0.37390655279159546, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12805373966693878, + "step": 4352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1010.7076416015625, + "completions/mean_terminated_length": 788.6314697265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9276010867828033, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12430968068296423, + "kl": 0.028839111328125, + "learning_rate": 1.144018607952417e-07, + "loss": 0.0639, + "num_tokens": 2367732739.0, + "reward": 2.41796875, + "reward_std": 0.4178357720375061, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1199323832988739, + "step": 4353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1202.21875, + "completions/mean_terminated_length": 906.7047729492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9278141814501092, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11453165564394323, + "kl": 0.02557373046875, + "learning_rate": 1.143179664132359e-07, + "loss": 0.0571, + "num_tokens": 2368344981.0, + "reward": 2.3777902126312256, + "reward_std": 0.503399133682251, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.15795646607875824, + "step": 4354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 879.857177734375, + "completions/mean_terminated_length": 726.4646606445312, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9280272761174152, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12289841008234728, + "kl": 0.02880859375, + "learning_rate": 1.142343131478872e-07, + "loss": 0.0077, + "num_tokens": 2368806389.0, + "reward": 2.5496652126312256, + "reward_std": 0.3370528519153595, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.9642857313156128, + "rewards/format_reward/std": 0.18578432500362396, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10763297230005264, + "step": 4355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1047.247802734375, + "completions/mean_terminated_length": 852.4346313476562, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9282403707847211, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12407789343582892, + "kl": 0.025054931640625, + "learning_rate": 1.141509010454911e-07, + "loss": 0.0489, + "num_tokens": 2369347636.0, + "reward": 2.435267925262451, + "reward_std": 0.44633668661117554, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.13465432822704315, + "step": 4356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 903.6741333007812, + "completions/mean_terminated_length": 763.142822265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9284534654520271, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1285686642720892, + "kl": 0.029205322265625, + "learning_rate": 1.1406773015220993e-07, + "loss": 0.0475, + "num_tokens": 2369823218.0, + "reward": 2.53515625, + "reward_std": 0.47579964995384216, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.4810029864311218, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13927440345287323, + "step": 4357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 775.872802734375, + "completions/mean_terminated_length": 637.32421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.928666560119333, + "frac_reward_zero_std": 0.3214285969734192, + "grad_norm": 0.1306281199466388, + "kl": 0.03546142578125, + "learning_rate": 1.1398480051407226e-07, + "loss": 0.1102, + "num_tokens": 2370232505.0, + "reward": 2.640625, + "reward_std": 0.30411162972450256, + "rewards/accuracy_reward/mean": 0.6941964030265808, + "rewards/accuracy_reward/std": 0.4612620174884796, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.191280335187912, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.08709356933832169, + "step": 4358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 942.3035888671875, + "completions/mean_terminated_length": 777.86669921875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9288796547866389, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14897681364637724, + "kl": 0.031158447265625, + "learning_rate": 1.1390211217697346e-07, + "loss": 0.048, + "num_tokens": 2370722385.0, + "reward": 2.4135046005249023, + "reward_std": 0.4109794795513153, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396106719971, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.13851940631866455, + "step": 4359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1152.40185546875, + "completions/mean_terminated_length": 920.955078125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9290927494539449, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11341616821391098, + "kl": 0.025787353515625, + "learning_rate": 1.138196651866751e-07, + "loss": 0.0688, + "num_tokens": 2371311893.0, + "reward": 2.4229912757873535, + "reward_std": 0.4458398222923279, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15822990238666534, + "step": 4360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1001.9152221679688, + "completions/mean_terminated_length": 827.5677490234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9293058441212508, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13453878503123481, + "kl": 0.02850341796875, + "learning_rate": 1.1373745958880523e-07, + "loss": 0.0722, + "num_tokens": 2371826959.0, + "reward": 2.540736675262451, + "reward_std": 0.45150238275527954, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.15279823541641235, + "step": 4361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 911.5469360351562, + "completions/mean_terminated_length": 729.0077514648438, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9295189387885568, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13133984613212665, + "kl": 0.027862548828125, + "learning_rate": 1.1365549542885853e-07, + "loss": 0.0907, + "num_tokens": 2372303140.0, + "reward": 2.3989956378936768, + "reward_std": 0.38063284754753113, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10439462959766388, + "step": 4362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1037.212158203125, + "completions/mean_terminated_length": 827.4258422851562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9297320334558627, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11123847437447203, + "kl": 0.026214599609375, + "learning_rate": 1.1357377275219578e-07, + "loss": 0.0719, + "num_tokens": 2372841523.0, + "reward": 2.4146206378936768, + "reward_std": 0.3301648199558258, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10962118953466415, + "step": 4363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 911.9129638671875, + "completions/mean_terminated_length": 732.8397827148438, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.9299451281231688, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.16768318350005512, + "kl": 0.031524658203125, + "learning_rate": 1.1349229160404416e-07, + "loss": 0.0652, + "num_tokens": 2373325260.0, + "reward": 2.5245537757873535, + "reward_std": 0.46250128746032715, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295229911804, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12782442569732666, + "step": 4364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 840.8058471679688, + "completions/mean_terminated_length": 661.2743530273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9301582227904747, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14941438538884247, + "kl": 0.033905029296875, + "learning_rate": 1.1341105202949734e-07, + "loss": 0.076, + "num_tokens": 2373766085.0, + "reward": 2.4799108505249023, + "reward_std": 0.37186330556869507, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.12569716572761536, + "step": 4365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1156.5692138671875, + "completions/mean_terminated_length": 913.4517211914062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9303713174577806, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11774759243800054, + "kl": 0.023162841796875, + "learning_rate": 1.1333005407351516e-07, + "loss": 0.0731, + "num_tokens": 2374355508.0, + "reward": 2.314732313156128, + "reward_std": 0.42696940898895264, + "rewards/accuracy_reward/mean": 0.4174107015132904, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1373893767595291, + "step": 4366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 926.544677734375, + "completions/mean_terminated_length": 776.0708618164062, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.9305844121250866, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1285513172212822, + "kl": 0.025482177734375, + "learning_rate": 1.1324929778092393e-07, + "loss": 0.0782, + "num_tokens": 2374833352.0, + "reward": 2.4497768878936768, + "reward_std": 0.44200292229652405, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13607001304626465, + "step": 4367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 999.6808471679688, + "completions/mean_terminated_length": 808.8258666992188, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9307975067923925, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10451063452429236, + "kl": 0.025665283203125, + "learning_rate": 1.1316878319641586e-07, + "loss": 0.0607, + "num_tokens": 2375353929.0, + "reward": 2.513951063156128, + "reward_std": 0.3639531433582306, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9849330186843872, + "rewards/tag_count_reward/std": 0.08475254476070404, + "step": 4368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1015.7410888671875, + "completions/mean_terminated_length": 781.0082397460938, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9310106014596985, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1305959057171984, + "kl": 0.026397705078125, + "learning_rate": 1.1308851036454973e-07, + "loss": 0.0845, + "num_tokens": 2375880357.0, + "reward": 2.4229912757873535, + "reward_std": 0.3985334038734436, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1406535804271698, + "step": 4369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1042.0491943359375, + "completions/mean_terminated_length": 764.05126953125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9312236961270044, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 1.2971856541226816, + "kl": 0.035675048828125, + "learning_rate": 1.1300847932975042e-07, + "loss": 0.0341, + "num_tokens": 2376419739.0, + "reward": 2.2176339626312256, + "reward_std": 0.36447110772132874, + "rewards/accuracy_reward/mean": 0.3303571343421936, + "rewards/accuracy_reward/std": 0.4708675146102905, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12758491933345795, + "step": 4370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1001.2433471679688, + "completions/mean_terminated_length": 820.3900756835938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9314367907943104, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1578556177760365, + "kl": 0.027587890625, + "learning_rate": 1.1292869013630895e-07, + "loss": 0.0934, + "num_tokens": 2376944280.0, + "reward": 2.3989956378936768, + "reward_std": 0.4837868809700012, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1552460640668869, + "step": 4371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1080.296875, + "completions/mean_terminated_length": 876.2946166992188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9316498854616163, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.132885951101939, + "kl": 0.026763916015625, + "learning_rate": 1.128491428283825e-07, + "loss": 0.1011, + "num_tokens": 2377492509.0, + "reward": 2.4068081378936768, + "reward_std": 0.5362050533294678, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.170974463224411, + "step": 4372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1153.212158203125, + "completions/mean_terminated_length": 892.7694091796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9318629801289223, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11481821301552803, + "kl": 0.023223876953125, + "learning_rate": 1.1276983744999442e-07, + "loss": 0.053, + "num_tokens": 2378080108.0, + "reward": 2.322544813156128, + "reward_std": 0.35287967324256897, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11345604062080383, + "step": 4373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 941.6116333007812, + "completions/mean_terminated_length": 750.4555053710938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9320760747962282, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12129912300372475, + "kl": 0.026214599609375, + "learning_rate": 1.1269077404503432e-07, + "loss": 0.0768, + "num_tokens": 2378579518.0, + "reward": 2.4760046005249023, + "reward_std": 0.4287765920162201, + "rewards/accuracy_reward/mean": 0.5535714030265808, + "rewards/accuracy_reward/std": 0.4976775646209717, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11919102817773819, + "step": 4374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 986.77685546875, + "completions/mean_terminated_length": 752.5558471679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9322891694635341, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13361245328437274, + "kl": 0.028350830078125, + "learning_rate": 1.1261195265725756e-07, + "loss": 0.0368, + "num_tokens": 2379089434.0, + "reward": 2.4575893878936768, + "reward_std": 0.36607611179351807, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.10992966592311859, + "step": 4375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1007.0469360351562, + "completions/mean_terminated_length": 820.7711181640625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.9325022641308401, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13008563735391598, + "kl": 0.02606201171875, + "learning_rate": 1.125333733302857e-07, + "loss": 0.0648, + "num_tokens": 2379609711.0, + "reward": 2.3895089626312256, + "reward_std": 0.4988686144351959, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854744791984558, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16375111043453217, + "step": 4376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1101.2545166015625, + "completions/mean_terminated_length": 836.1657104492188, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.932715358798146, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12968861329862844, + "kl": 0.025177001953125, + "learning_rate": 1.1245503610760662e-07, + "loss": 0.1161, + "num_tokens": 2380179105.0, + "reward": 2.3777902126312256, + "reward_std": 0.4191914200782776, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13335825502872467, + "step": 4377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 877.2567138671875, + "completions/mean_terminated_length": 706.585693359375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9329284534654521, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13647455532216726, + "kl": 0.03204345703125, + "learning_rate": 1.1237694103257375e-07, + "loss": 0.1044, + "num_tokens": 2380636100.0, + "reward": 2.6450893878936768, + "reward_std": 0.4329272508621216, + "rewards/accuracy_reward/mean": 0.7410714030265808, + "rewards/accuracy_reward/std": 0.43853598833084106, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12891364097595215, + "step": 4378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 909.6495971679688, + "completions/mean_terminated_length": 698.8438720703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.933141548132758, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.214846058861817, + "kl": 0.0323486328125, + "learning_rate": 1.1229908814840692e-07, + "loss": 0.0847, + "num_tokens": 2381114215.0, + "reward": 2.3582589626312256, + "reward_std": 0.43972936272621155, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9542410969734192, + "rewards/tag_count_reward/std": 0.16929872334003448, + "step": 4379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 929.1406860351562, + "completions/mean_terminated_length": 714.8909301757812, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.933354642800064, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14589530696516964, + "kl": 0.029815673828125, + "learning_rate": 1.1222147749819157e-07, + "loss": 0.0477, + "num_tokens": 2381605382.0, + "reward": 2.411830425262451, + "reward_std": 0.4584210216999054, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1314026117324829, + "step": 4380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 962.5357666015625, + "completions/mean_terminated_length": 754.6808471679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9335677374673699, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13908846066914746, + "kl": 0.028228759765625, + "learning_rate": 1.1214410912487935e-07, + "loss": 0.1204, + "num_tokens": 2382109718.0, + "reward": 2.4503350257873535, + "reward_std": 0.4439104199409485, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1427011638879776, + "step": 4381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 934.3527221679688, + "completions/mean_terminated_length": 752.1194458007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9337808321346758, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12124389500192245, + "kl": 0.027587890625, + "learning_rate": 1.1206698307128779e-07, + "loss": 0.0604, + "num_tokens": 2382595908.0, + "reward": 2.453125, + "reward_std": 0.33934077620506287, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.08547309041023254, + "step": 4382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 893.07373046875, + "completions/mean_terminated_length": 711.031005859375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9339939268019818, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.11985472897366503, + "kl": 0.02740478515625, + "learning_rate": 1.119900993801001e-07, + "loss": -0.0122, + "num_tokens": 2383065029.0, + "reward": 2.5541296005249023, + "reward_std": 0.287217915058136, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9732142686843872, + "rewards/format_reward/std": 0.1616371124982834, + "rewards/tag_count_reward/mean": 0.9871651530265808, + "rewards/tag_count_reward/std": 0.08991295844316483, + "step": 4383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 979.8438110351562, + "completions/mean_terminated_length": 775.3031616210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9342070214692877, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12333641176694152, + "kl": 0.03045654296875, + "learning_rate": 1.1191345809386565e-07, + "loss": 0.0322, + "num_tokens": 2383572351.0, + "reward": 2.3989956378936768, + "reward_std": 0.35006973147392273, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.09933306276798248, + "step": 4384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 846.0469360351562, + "completions/mean_terminated_length": 731.4352416992188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9344201161365937, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1414336794581791, + "kl": 0.03057861328125, + "learning_rate": 1.1183705925499947e-07, + "loss": 0.0607, + "num_tokens": 2384014212.0, + "reward": 2.584263563156128, + "reward_std": 0.35953983664512634, + "rewards/accuracy_reward/mean": 0.6607142686843872, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1208655834197998, + "step": 4385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 926.7991333007812, + "completions/mean_terminated_length": 736.5169677734375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9346332108038996, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12186933627002564, + "kl": 0.029083251953125, + "learning_rate": 1.1176090290578244e-07, + "loss": 0.0515, + "num_tokens": 2384501162.0, + "reward": 2.564174175262451, + "reward_std": 0.39993947744369507, + "rewards/accuracy_reward/mean": 0.6495535969734192, + "rewards/accuracy_reward/std": 0.4776431620121002, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12015076726675034, + "step": 4386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1020.2344360351562, + "completions/mean_terminated_length": 789.9699096679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9348463054712056, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11369895684751384, + "kl": 0.028076171875, + "learning_rate": 1.1168498908836136e-07, + "loss": 0.0419, + "num_tokens": 2385024467.0, + "reward": 2.458705425262451, + "reward_std": 0.3179273009300232, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.09750170260667801, + "step": 4387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1100.2567138671875, + "completions/mean_terminated_length": 845.1983032226562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9350594001385115, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1360018368592396, + "kl": 0.0242919921875, + "learning_rate": 1.1160931784474858e-07, + "loss": 0.1064, + "num_tokens": 2385588278.0, + "reward": 2.27734375, + "reward_std": 0.4707585871219635, + "rewards/accuracy_reward/mean": 0.41203704476356506, + "rewards/accuracy_reward/std": 0.4927724003791809, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16933098435401917, + "step": 4388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 879.55810546875, + "completions/mean_terminated_length": 698.87109375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9352724948058175, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.13263314476248234, + "kl": 0.034912109375, + "learning_rate": 1.1153388921682253e-07, + "loss": 0.0084, + "num_tokens": 2386050160.0, + "reward": 2.4112725257873535, + "reward_std": 0.3310636878013611, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10811903327703476, + "step": 4389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 876.7031860351562, + "completions/mean_terminated_length": 729.5552978515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9354855894731234, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13724809157990736, + "kl": 0.029754638671875, + "learning_rate": 1.1145870324632704e-07, + "loss": 0.0237, + "num_tokens": 2386510299.0, + "reward": 2.4174108505249023, + "reward_std": 0.2807876467704773, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9665178656578064, + "rewards/format_reward/std": 0.1800929754972458, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.08547309041023254, + "step": 4390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1043.0179443359375, + "completions/mean_terminated_length": 824.5435180664062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9356986841404293, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13280877350636944, + "kl": 0.0255126953125, + "learning_rate": 1.1138375997487187e-07, + "loss": 0.0904, + "num_tokens": 2387046835.0, + "reward": 2.4017858505249023, + "reward_std": 0.4428028166294098, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09910815209150314, + "step": 4391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1039.74560546875, + "completions/mean_terminated_length": 830.4851684570312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9359117788077354, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12571804545203322, + "kl": 0.025238037109375, + "learning_rate": 1.1130905944393228e-07, + "loss": 0.0243, + "num_tokens": 2387585841.0, + "reward": 2.3465402126312256, + "reward_std": 0.3728891909122467, + "rewards/accuracy_reward/mean": 0.4196428656578064, + "rewards/accuracy_reward/std": 0.4940522015094757, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12035840004682541, + "step": 4392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 908.107177734375, + "completions/mean_terminated_length": 768.1203002929688, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.9361248734750413, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13581340330210478, + "kl": 0.0308837890625, + "learning_rate": 1.112346016948494e-07, + "loss": 0.1104, + "num_tokens": 2388052401.0, + "reward": 2.5089287757873535, + "reward_std": 0.4902397692203522, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48466411232948303, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14140157401561737, + "step": 4393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1066.4285888671875, + "completions/mean_terminated_length": 819.664794921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9363379681423473, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.9904674160188459, + "kl": 0.028839111328125, + "learning_rate": 1.1116038676882983e-07, + "loss": 0.0221, + "num_tokens": 2388606625.0, + "reward": 2.4246652126312256, + "reward_std": 0.4120630621910095, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.1131737232208252, + "step": 4394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 852.1920166015625, + "completions/mean_terminated_length": 731.729736328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9365510628096532, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1392593356652518, + "kl": 0.031494140625, + "learning_rate": 1.1108641470694582e-07, + "loss": 0.0252, + "num_tokens": 2389053943.0, + "reward": 2.474888563156128, + "reward_std": 0.369652658700943, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10304656624794006, + "step": 4395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 966.310302734375, + "completions/mean_terminated_length": 805.443603515625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9367641574769592, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1716921415544946, + "kl": 0.031768798828125, + "learning_rate": 1.110126855501354e-07, + "loss": 0.0666, + "num_tokens": 2389556482.0, + "reward": 2.3013393878936768, + "reward_std": 0.4197487235069275, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13888955116271973, + "step": 4396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1038.071533203125, + "completions/mean_terminated_length": 841.4719848632812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9369772521442651, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11352532306758972, + "kl": 0.025604248046875, + "learning_rate": 1.1093919933920187e-07, + "loss": 0.0462, + "num_tokens": 2390091314.0, + "reward": 2.377232313156128, + "reward_std": 0.4040103852748871, + "rewards/accuracy_reward/mean": 0.4508928656578064, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.10383255779743195, + "step": 4397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1136.3348388671875, + "completions/mean_terminated_length": 881.0685424804688, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9371903468115711, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1310437393972883, + "kl": 0.026153564453125, + "learning_rate": 1.1086595611481425e-07, + "loss": 0.0964, + "num_tokens": 2390676120.0, + "reward": 2.3643975257873535, + "reward_std": 0.5020871758460999, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15332838892936707, + "step": 4398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 883.1339721679688, + "completions/mean_terminated_length": 685.4412841796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.937403441478877, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14230780157464434, + "kl": 0.0322265625, + "learning_rate": 1.1079295591750711e-07, + "loss": 0.0671, + "num_tokens": 2391136580.0, + "reward": 2.506138563156128, + "reward_std": 0.40291139483451843, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10073082149028778, + "step": 4399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 891.2366333007812, + "completions/mean_terminated_length": 701.9480590820312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9376165361461829, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1253766819664411, + "kl": 0.0325927734375, + "learning_rate": 1.1072019878768041e-07, + "loss": 0.0551, + "num_tokens": 2391606270.0, + "reward": 2.59765625, + "reward_std": 0.36383917927742004, + "rewards/accuracy_reward/mean": 0.6830357313156128, + "rewards/accuracy_reward/std": 0.4658135175704956, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13669590651988983, + "step": 4400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1183.1741943359375, + "completions/mean_terminated_length": 841.0155639648438, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9378296308134889, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12394164739988968, + "kl": 0.022552490234375, + "learning_rate": 1.1064768476559973e-07, + "loss": 0.0876, + "num_tokens": 2392204556.0, + "reward": 2.3236608505249023, + "reward_std": 0.4416102170944214, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13686132431030273, + "step": 4401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1059.7388916015625, + "completions/mean_terminated_length": 851.4027099609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9380427254807948, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13051586307945492, + "kl": 0.02874755859375, + "learning_rate": 1.105754138913959e-07, + "loss": 0.0677, + "num_tokens": 2392745399.0, + "reward": 2.498326063156128, + "reward_std": 0.43339961767196655, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.1399807333946228, + "step": 4402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 991.0491333007812, + "completions/mean_terminated_length": 785.2960205078125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9382558201481008, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12506374457362188, + "kl": 0.02874755859375, + "learning_rate": 1.1050338620506544e-07, + "loss": 0.0658, + "num_tokens": 2393257549.0, + "reward": 2.5424108505249023, + "reward_std": 0.4021592438220978, + "rewards/accuracy_reward/mean": 0.6450892686843872, + "rewards/accuracy_reward/std": 0.4790211617946625, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.14942027628421783, + "step": 4403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 822.1094360351562, + "completions/mean_terminated_length": 718.2203979492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9384689148154067, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14623618122121235, + "kl": 0.03240966796875, + "learning_rate": 1.104316017464701e-07, + "loss": 0.0674, + "num_tokens": 2393697838.0, + "reward": 2.646205425262451, + "reward_std": 0.3961551785469055, + "rewards/accuracy_reward/mean": 0.7410714030265808, + "rewards/accuracy_reward/std": 0.43853598833084106, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12473505735397339, + "step": 4404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 940.85498046875, + "completions/mean_terminated_length": 746.1600952148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9386820094827127, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13314206068965498, + "kl": 0.028472900390625, + "learning_rate": 1.1036006055533711e-07, + "loss": 0.0973, + "num_tokens": 2394181149.0, + "reward": 2.39453125, + "reward_std": 0.4445429742336273, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1372518539428711, + "step": 4405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 911.8281860351562, + "completions/mean_terminated_length": 679.7069702148438, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9388951041500186, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15228734164723715, + "kl": 0.027801513671875, + "learning_rate": 1.1028876267125905e-07, + "loss": 0.1019, + "num_tokens": 2394657616.0, + "reward": 2.396763563156128, + "reward_std": 0.3983593285083771, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.1276179403066635, + "step": 4406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 862.0670166015625, + "completions/mean_terminated_length": 706.33837890625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9391081988173245, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12942239639744188, + "kl": 0.029937744140625, + "learning_rate": 1.1021770813369378e-07, + "loss": 0.0699, + "num_tokens": 2395121918.0, + "reward": 2.513951063156128, + "reward_std": 0.3371692895889282, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1240563914179802, + "step": 4407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 973.6272583007812, + "completions/mean_terminated_length": 743.6124877929688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9393212934846306, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12884700790264783, + "kl": 0.026580810546875, + "learning_rate": 1.1014689698196463e-07, + "loss": 0.085, + "num_tokens": 2395635239.0, + "reward": 2.4620537757873535, + "reward_std": 0.3684464395046234, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11967316269874573, + "step": 4408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 907.0692138671875, + "completions/mean_terminated_length": 716.9140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9395343881519365, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1451302704816034, + "kl": 0.02947998046875, + "learning_rate": 1.1007632925526012e-07, + "loss": 0.1251, + "num_tokens": 2396108678.0, + "reward": 2.6082589626312256, + "reward_std": 0.45828431844711304, + "rewards/accuracy_reward/mean": 0.7075892686843872, + "rewards/accuracy_reward/std": 0.4553784728050232, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12758491933345795, + "step": 4409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 938.6004638671875, + "completions/mean_terminated_length": 760.40673828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9397474828192425, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13217111284181304, + "kl": 0.0284423828125, + "learning_rate": 1.1000600499263406e-07, + "loss": 0.0485, + "num_tokens": 2396593875.0, + "reward": 2.517857313156128, + "reward_std": 0.42520764470100403, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10013572871685028, + "step": 4410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1068.1629638671875, + "completions/mean_terminated_length": 821.8351440429688, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.9399605774865484, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14822661303540743, + "kl": 0.027587890625, + "learning_rate": 1.0993592423300561e-07, + "loss": 0.0781, + "num_tokens": 2397139148.0, + "reward": 2.3978796005249023, + "reward_std": 0.43711966276168823, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.14151519536972046, + "step": 4411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1006.8504638671875, + "completions/mean_terminated_length": 807.4813842773438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9401736721538544, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1354165066113595, + "kl": 0.026153564453125, + "learning_rate": 1.0986608701515907e-07, + "loss": 0.1085, + "num_tokens": 2397652617.0, + "reward": 2.4838171005249023, + "reward_std": 0.4179857671260834, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.1442066878080368, + "step": 4412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 961.60498046875, + "completions/mean_terminated_length": 767.1973876953125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9403867668211603, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12511375976341238, + "kl": 0.029571533203125, + "learning_rate": 1.0979649337774394e-07, + "loss": 0.0594, + "num_tokens": 2398149816.0, + "reward": 2.4732143878936768, + "reward_std": 0.4018925428390503, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.11992326378822327, + "step": 4413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 987.8281860351562, + "completions/mean_terminated_length": 739.5785522460938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9405998614884663, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13422537765455228, + "kl": 0.027008056640625, + "learning_rate": 1.0972714335927498e-07, + "loss": 0.1033, + "num_tokens": 2398659723.0, + "reward": 2.3738839626312256, + "reward_std": 0.3860425055027008, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.1421368569135666, + "step": 4414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 950.71435546875, + "completions/mean_terminated_length": 764.4908447265625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9408129561557722, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1313605382830849, + "kl": 0.02862548828125, + "learning_rate": 1.0965803699813223e-07, + "loss": 0.103, + "num_tokens": 2399148235.0, + "reward": 2.521763563156128, + "reward_std": 0.43817147612571716, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12086557596921921, + "step": 4415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 909.919677734375, + "completions/mean_terminated_length": 782.8386840820312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9410260508230781, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13797339735679653, + "kl": 0.029388427734375, + "learning_rate": 1.0958917433256066e-07, + "loss": 0.0568, + "num_tokens": 2399627815.0, + "reward": 2.5390625, + "reward_std": 0.42338475584983826, + "rewards/accuracy_reward/mean": 0.6316964030265808, + "rewards/accuracy_reward/std": 0.4828835427761078, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13455694913864136, + "step": 4416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1031.4107666015625, + "completions/mean_terminated_length": 839.95751953125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.9412391454903841, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13067279555446829, + "kl": 0.02642822265625, + "learning_rate": 1.0952055540067057e-07, + "loss": 0.1179, + "num_tokens": 2400162511.0, + "reward": 2.4190850257873535, + "reward_std": 0.3631475269794464, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.1057254895567894, + "step": 4417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 930.4219360351562, + "completions/mean_terminated_length": 733.8923950195312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.94145224015769, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1276741451119799, + "kl": 0.028289794921875, + "learning_rate": 1.0945218024043721e-07, + "loss": 0.0126, + "num_tokens": 2400645708.0, + "reward": 2.4916296005249023, + "reward_std": 0.4471625089645386, + "rewards/accuracy_reward/mean": 0.5870535969734192, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.12517838180065155, + "step": 4418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 874.1094360351562, + "completions/mean_terminated_length": 696.0642700195312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.941665334824996, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13604941594691422, + "kl": 0.030029296875, + "learning_rate": 1.0938404888970096e-07, + "loss": 0.0312, + "num_tokens": 2401105997.0, + "reward": 2.4754464626312256, + "reward_std": 0.40643665194511414, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12337145954370499, + "step": 4419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1001.7344360351562, + "completions/mean_terminated_length": 811.2533569335938, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.9418784294923019, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12908360608501396, + "kl": 0.02691650390625, + "learning_rate": 1.0931616138616751e-07, + "loss": 0.0902, + "num_tokens": 2401625094.0, + "reward": 2.3895089626312256, + "reward_std": 0.4120877683162689, + "rewards/accuracy_reward/mean": 0.5069444179534912, + "rewards/accuracy_reward/std": 0.5005314350128174, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.14601868391036987, + "step": 4420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1046.0625, + "completions/mean_terminated_length": 828.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.942091524159608, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14533836784691892, + "kl": 0.02716064453125, + "learning_rate": 1.0924851776740713e-07, + "loss": 0.138, + "num_tokens": 2402162786.0, + "reward": 2.435826063156128, + "reward_std": 0.5159264802932739, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.16543777287006378, + "step": 4421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 939.8035888671875, + "completions/mean_terminated_length": 713.3978271484375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9423046188269139, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12571940291886324, + "kl": 0.028778076171875, + "learning_rate": 1.0918111807085558e-07, + "loss": 0.0599, + "num_tokens": 2402653130.0, + "reward": 2.4877233505249023, + "reward_std": 0.3626733124256134, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.12103716284036636, + "step": 4422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 961.7701416015625, + "completions/mean_terminated_length": 793.79638671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9425177134942198, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13940454937265723, + "kl": 0.029876708984375, + "learning_rate": 1.0911396233381338e-07, + "loss": 0.0804, + "num_tokens": 2403151843.0, + "reward": 2.5005581378936768, + "reward_std": 0.4199605882167816, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.4889315068721771, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1454136222600937, + "step": 4423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 850.8058471679688, + "completions/mean_terminated_length": 726.9581298828125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.9427308081615258, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1336195718897701, + "kl": 0.03363037109375, + "learning_rate": 1.0904705059344605e-07, + "loss": 0.042, + "num_tokens": 2403601740.0, + "reward": 2.6138393878936768, + "reward_std": 0.32564234733581543, + "rewards/accuracy_reward/mean": 0.6852678656578064, + "rewards/accuracy_reward/std": 0.4649282693862915, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9910714030265808, + "rewards/tag_count_reward/std": 0.05962758511304855, + "step": 4424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 893.7188110351562, + "completions/mean_terminated_length": 732.1781005859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9429439028288317, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14153226418602613, + "kl": 0.031494140625, + "learning_rate": 1.0898038288678416e-07, + "loss": 0.052, + "num_tokens": 2404065374.0, + "reward": 2.4933037757873535, + "reward_std": 0.4588598310947418, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.1262323260307312, + "step": 4425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 940.013427734375, + "completions/mean_terminated_length": 745.1705932617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9431569974961377, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1173043467412791, + "kl": 0.026397705078125, + "learning_rate": 1.0891395925072313e-07, + "loss": 0.0666, + "num_tokens": 2404554628.0, + "reward": 2.5396206378936768, + "reward_std": 0.36521852016448975, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9860491156578064, + "rewards/tag_count_reward/std": 0.083281509578228, + "step": 4426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 836.5960083007812, + "completions/mean_terminated_length": 704.660888671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9433700921634436, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.14746680020960345, + "kl": 0.03509521484375, + "learning_rate": 1.0884777972202346e-07, + "loss": 0.0899, + "num_tokens": 2404992703.0, + "reward": 2.482142925262451, + "reward_std": 0.3794863820075989, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14426834881305695, + "step": 4427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1055.919677734375, + "completions/mean_terminated_length": 813.4111328125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9435831868307496, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12311732086695507, + "kl": 0.02490234375, + "learning_rate": 1.0878184433731039e-07, + "loss": 0.0453, + "num_tokens": 2405543867.0, + "reward": 2.4760046005249023, + "reward_std": 0.4557977616786957, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12035840004682541, + "step": 4428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1018.669677734375, + "completions/mean_terminated_length": 811.6997680664062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9437962814980555, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12239655321029592, + "kl": 0.026123046875, + "learning_rate": 1.0871615313307402e-07, + "loss": 0.0743, + "num_tokens": 2406068583.0, + "reward": 2.5122768878936768, + "reward_std": 0.412568598985672, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.11055814474821091, + "step": 4429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 969.0491333007812, + "completions/mean_terminated_length": 798.98193359375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.9440093761653615, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12931178408718322, + "kl": 0.029632568359375, + "learning_rate": 1.0865070614566955e-07, + "loss": 0.034, + "num_tokens": 2406567677.0, + "reward": 2.486049175262451, + "reward_std": 0.4383724629878998, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12715734541416168, + "step": 4430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1059.5357666015625, + "completions/mean_terminated_length": 870.2553100585938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9442224708326674, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11520428219047466, + "kl": 0.026580810546875, + "learning_rate": 1.0858550341131687e-07, + "loss": 0.0438, + "num_tokens": 2407108749.0, + "reward": 2.467076063156128, + "reward_std": 0.4512147903442383, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12338031828403473, + "step": 4431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1065.5045166015625, + "completions/mean_terminated_length": 790.4057006835938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9444355654999733, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.13900965080550057, + "kl": 0.029632568359375, + "learning_rate": 1.085205449661006e-07, + "loss": 0.0441, + "num_tokens": 2407658623.0, + "reward": 2.415736675262451, + "reward_std": 0.3632409870624542, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.126290425658226, + "step": 4432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 906.716552734375, + "completions/mean_terminated_length": 740.3401489257812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9446486601672793, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1360016041036316, + "kl": 0.029266357421875, + "learning_rate": 1.0845583084597026e-07, + "loss": 0.0969, + "num_tokens": 2408131488.0, + "reward": 2.446986675262451, + "reward_std": 0.37030893564224243, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9871651530265808, + "rewards/tag_count_reward/std": 0.08674707263708115, + "step": 4433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 992.1920166015625, + "completions/mean_terminated_length": 809.77490234375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9448617548345852, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1392144043731186, + "kl": 0.027862548828125, + "learning_rate": 1.0839136108674032e-07, + "loss": 0.0802, + "num_tokens": 2408642134.0, + "reward": 2.4056921005249023, + "reward_std": 0.4773463010787964, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.17171035706996918, + "step": 4434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 947.4866333007812, + "completions/mean_terminated_length": 733.2532958984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9450748495018912, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14749550931281832, + "kl": 0.032012939453125, + "learning_rate": 1.0832713572408976e-07, + "loss": 0.0854, + "num_tokens": 2409136000.0, + "reward": 2.540736675262451, + "reward_std": 0.37744495272636414, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.10893574357032776, + "step": 4435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 985.9420166015625, + "completions/mean_terminated_length": 762.0486450195312, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9452879441691971, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12653013762864193, + "kl": 0.027740478515625, + "learning_rate": 1.0826315479356235e-07, + "loss": 0.0493, + "num_tokens": 2409645814.0, + "reward": 2.4771206378936768, + "reward_std": 0.42411619424819946, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12540756165981293, + "step": 4436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1097.0804443359375, + "completions/mean_terminated_length": 834.2905883789062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9455010388365032, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1269239399175205, + "kl": 0.025299072265625, + "learning_rate": 1.081994183305667e-07, + "loss": 0.0905, + "num_tokens": 2410203690.0, + "reward": 2.3911831378936768, + "reward_std": 0.5035035014152527, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.16515076160430908, + "step": 4437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1023.79248046875, + "completions/mean_terminated_length": 780.472412109375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9457141335038091, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1223253508124097, + "kl": 0.02630615234375, + "learning_rate": 1.08135926370376e-07, + "loss": 0.0645, + "num_tokens": 2410733741.0, + "reward": 2.4246652126312256, + "reward_std": 0.3663933575153351, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12224180996417999, + "step": 4438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 982.0647583007812, + "completions/mean_terminated_length": 771.1577758789062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.945927228171115, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12817284210064, + "kl": 0.02850341796875, + "learning_rate": 1.0807267894812834e-07, + "loss": 0.1035, + "num_tokens": 2411237898.0, + "reward": 2.5, + "reward_std": 0.4459177851676941, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.1262323409318924, + "step": 4439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 897.3527221679688, + "completions/mean_terminated_length": 712.5336303710938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.946140322838421, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13932298455989758, + "kl": 0.029632568359375, + "learning_rate": 1.080096760988262e-07, + "loss": 0.0462, + "num_tokens": 2411715304.0, + "reward": 2.4291296005249023, + "reward_std": 0.40668851137161255, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.135438933968544, + "step": 4440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1097.1429443359375, + "completions/mean_terminated_length": 858.1005249023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9463534175057269, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13866983919380696, + "kl": 0.02532958984375, + "learning_rate": 1.0794691785733684e-07, + "loss": 0.0617, + "num_tokens": 2412275272.0, + "reward": 2.34765625, + "reward_std": 0.5042898058891296, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.1645980179309845, + "step": 4441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 1002.7813110351562, + "completions/mean_terminated_length": 740.0167236328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9465665121730329, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1415494046134373, + "kl": 0.027496337890625, + "learning_rate": 1.0788440425839215e-07, + "loss": 0.0605, + "num_tokens": 2412789926.0, + "reward": 2.407924175262451, + "reward_std": 0.46174025535583496, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.12665565311908722, + "step": 4442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1046.875, + "completions/mean_terminated_length": 784.6083984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9467796068403388, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12110601505399457, + "kl": 0.025726318359375, + "learning_rate": 1.0782213533658867e-07, + "loss": 0.0699, + "num_tokens": 2413324542.0, + "reward": 2.2371652126312256, + "reward_std": 0.4158909022808075, + "rewards/accuracy_reward/mean": 0.3571428656578064, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15604011714458466, + "step": 4443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 951.79248046875, + "completions/mean_terminated_length": 759.02099609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9469927015076448, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11757115916704247, + "kl": 0.027740478515625, + "learning_rate": 1.0776011112638747e-07, + "loss": 0.0267, + "num_tokens": 2413814769.0, + "reward": 2.494419813156128, + "reward_std": 0.4215051829814911, + "rewards/accuracy_reward/mean": 0.6157407164573669, + "rewards/accuracy_reward/std": 0.48698362708091736, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13399910926818848, + "step": 4444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1048.1585693359375, + "completions/mean_terminated_length": 803.7528076171875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9472057961749507, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12401392856837996, + "kl": 0.027130126953125, + "learning_rate": 1.0769833166211414e-07, + "loss": 0.0911, + "num_tokens": 2414349592.0, + "reward": 2.4838171005249023, + "reward_std": 0.46600082516670227, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489606618881226, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16142748296260834, + "step": 4445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 932.9241333007812, + "completions/mean_terminated_length": 767.0923461914062, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.9474188908422567, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12514265046791356, + "kl": 0.030120849609375, + "learning_rate": 1.0763679697795899e-07, + "loss": 0.0829, + "num_tokens": 2414832646.0, + "reward": 2.5228796005249023, + "reward_std": 0.41759487986564636, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1463720053434372, + "step": 4446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1076.19873046875, + "completions/mean_terminated_length": 880.7962646484375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9476319855095626, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1266694755001441, + "kl": 0.025390625, + "learning_rate": 1.0757550710797668e-07, + "loss": 0.0861, + "num_tokens": 2415388879.0, + "reward": 2.4654018878936768, + "reward_std": 0.44824090600013733, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.13444557785987854, + "step": 4447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 961.404052734375, + "completions/mean_terminated_length": 780.3046875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9478450801768685, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1337750738774125, + "kl": 0.030426025390625, + "learning_rate": 1.0751446208608642e-07, + "loss": 0.0762, + "num_tokens": 2415886628.0, + "reward": 2.4854912757873535, + "reward_std": 0.450486421585083, + "rewards/accuracy_reward/mean": 0.6183035969734192, + "rewards/accuracy_reward/std": 0.4863457977771759, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.15645259618759155, + "step": 4448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 963.3638916015625, + "completions/mean_terminated_length": 785.8779296875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9480581748441745, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12293067983244339, + "kl": 0.027679443359375, + "learning_rate": 1.0745366194607203e-07, + "loss": 0.0465, + "num_tokens": 2416383463.0, + "reward": 2.493861675262451, + "reward_std": 0.4035743176937103, + "rewards/accuracy_reward/mean": 0.5647321343421936, + "rewards/accuracy_reward/std": 0.49634629487991333, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12892211973667145, + "step": 4449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 896.24560546875, + "completions/mean_terminated_length": 777.0985107421875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9482712695114804, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13178409186063167, + "kl": 0.030029296875, + "learning_rate": 1.0739310672158174e-07, + "loss": 0.0473, + "num_tokens": 2416858341.0, + "reward": 2.5234375, + "reward_std": 0.4131523370742798, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.107582226395607, + "step": 4450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 904.5156860351562, + "completions/mean_terminated_length": 731.082275390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9484843641787865, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12929407369907378, + "kl": 0.0291748046875, + "learning_rate": 1.0733279644612822e-07, + "loss": 0.0614, + "num_tokens": 2417332060.0, + "reward": 2.4659600257873535, + "reward_std": 0.3387841284275055, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.11734377592802048, + "step": 4451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 986.6473388671875, + "completions/mean_terminated_length": 734.5028076171875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.9486974588460924, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1429625872946558, + "kl": 0.027313232421875, + "learning_rate": 1.072727311530886e-07, + "loss": 0.1122, + "num_tokens": 2417850686.0, + "reward": 2.36328125, + "reward_std": 0.46211400628089905, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12493880838155746, + "step": 4452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1048.9554443359375, + "completions/mean_terminated_length": 811.61328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9489105535133984, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11739460857062924, + "kl": 0.027130126953125, + "learning_rate": 1.0721291087570435e-07, + "loss": 0.0429, + "num_tokens": 2418388474.0, + "reward": 2.3839287757873535, + "reward_std": 0.3981561064720154, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14187753200531006, + "step": 4453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1015.544677734375, + "completions/mean_terminated_length": 787.6730346679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9491236481807043, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12153895420205191, + "kl": 0.02630615234375, + "learning_rate": 1.0715333564708152e-07, + "loss": 0.0686, + "num_tokens": 2418912398.0, + "reward": 2.4380581378936768, + "reward_std": 0.3482321500778198, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.1046096533536911, + "step": 4454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1036.857177734375, + "completions/mean_terminated_length": 840.0213012695312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9493367428480102, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11971476566126323, + "kl": 0.028228759765625, + "learning_rate": 1.0709400550019032e-07, + "loss": 0.0821, + "num_tokens": 2419444958.0, + "reward": 2.4620537757873535, + "reward_std": 0.41845688223838806, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15274205803871155, + "step": 4455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 913.216552734375, + "completions/mean_terminated_length": 717.1544799804688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9495498375153162, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.139177705039715, + "kl": 0.029571533203125, + "learning_rate": 1.0703492046786555e-07, + "loss": 0.1246, + "num_tokens": 2419915903.0, + "reward": 2.5033483505249023, + "reward_std": 0.3002256155014038, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13399912416934967, + "step": 4456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 948.2567138671875, + "completions/mean_terminated_length": 727.1287231445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9497629321826221, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1556627199468159, + "kl": 0.029296875, + "learning_rate": 1.0697608058280621e-07, + "loss": 0.1149, + "num_tokens": 2420415298.0, + "reward": 2.4458706378936768, + "reward_std": 0.4268389940261841, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14706972241401672, + "step": 4457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 951.2076416015625, + "completions/mean_terminated_length": 768.4088745117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9499760268499281, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12180670371396651, + "kl": 0.03057861328125, + "learning_rate": 1.0691748587757567e-07, + "loss": 0.0728, + "num_tokens": 2420913279.0, + "reward": 2.4402902126312256, + "reward_std": 0.3962782025337219, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11638234555721283, + "step": 4458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 954.1964721679688, + "completions/mean_terminated_length": 775.2103881835938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.950189121517234, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1503474504749525, + "kl": 0.028472900390625, + "learning_rate": 1.0685913638460168e-07, + "loss": 0.0674, + "num_tokens": 2421417159.0, + "reward": 2.4151787757873535, + "reward_std": 0.44619113206863403, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.15324795246124268, + "step": 4459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1013.57373046875, + "completions/mean_terminated_length": 802.2392578125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.95040221618454, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12767281938620414, + "kl": 0.027099609375, + "learning_rate": 1.0680103213617606e-07, + "loss": 0.0864, + "num_tokens": 2421941384.0, + "reward": 2.435267925262451, + "reward_std": 0.4095615744590759, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11036036163568497, + "step": 4460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1029.5670166015625, + "completions/mean_terminated_length": 801.3934326171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9506153108518459, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12178717088203075, + "kl": 0.024810791015625, + "learning_rate": 1.0674317316445523e-07, + "loss": 0.0858, + "num_tokens": 2422471910.0, + "reward": 2.5323662757873535, + "reward_std": 0.43594661355018616, + "rewards/accuracy_reward/mean": 0.6361607313156128, + "rewards/accuracy_reward/std": 0.4816409945487976, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.13763901591300964, + "step": 4461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 907.7500610351562, + "completions/mean_terminated_length": 731.4226684570312, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.9508284055191519, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1232258081316246, + "kl": 0.029327392578125, + "learning_rate": 1.0668555950145965e-07, + "loss": 0.0908, + "num_tokens": 2422941942.0, + "reward": 2.4972100257873535, + "reward_std": 0.43454083800315857, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13669590651988983, + "step": 4462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 869.1451416015625, + "completions/mean_terminated_length": 743.9827270507812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9510415001864578, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14109911443051323, + "kl": 0.02984619140625, + "learning_rate": 1.0662819117907403e-07, + "loss": 0.0824, + "num_tokens": 2423403671.0, + "reward": 2.650669813156128, + "reward_std": 0.39399418234825134, + "rewards/accuracy_reward/mean": 0.7388392686843872, + "rewards/accuracy_reward/std": 0.43975841999053955, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1292569637298584, + "step": 4463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1094.2835693359375, + "completions/mean_terminated_length": 820.2269897460938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9512545948537637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18726304293999432, + "kl": 0.029937744140625, + "learning_rate": 1.0657106822904735e-07, + "loss": 0.0525, + "num_tokens": 2423970918.0, + "reward": 2.2845983505249023, + "reward_std": 0.5178691744804382, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.494759202003479, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9564732313156128, + "rewards/tag_count_reward/std": 0.16317066550254822, + "step": 4464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1005.6406860351562, + "completions/mean_terminated_length": 782.4796752929688, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9514676895210697, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13345664224154202, + "kl": 0.02630615234375, + "learning_rate": 1.0651419068299287e-07, + "loss": 0.0794, + "num_tokens": 2424488405.0, + "reward": 2.4654018878936768, + "reward_std": 0.4529860019683838, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1569942682981491, + "step": 4465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 928.6027221679688, + "completions/mean_terminated_length": 707.11767578125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9516807841883757, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13083560485816573, + "kl": 0.0272216796875, + "learning_rate": 1.0645755857238787e-07, + "loss": 0.0541, + "num_tokens": 2424973811.0, + "reward": 2.533482313156128, + "reward_std": 0.37343981862068176, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11889871954917908, + "step": 4466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 991.0313110351562, + "completions/mean_terminated_length": 791.9734497070312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9518938788556817, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1230067664681598, + "kl": 0.027252197265625, + "learning_rate": 1.064011719285739e-07, + "loss": 0.0955, + "num_tokens": 2425487745.0, + "reward": 2.415736675262451, + "reward_std": 0.4372251629829407, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16573180258274078, + "step": 4467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 914.779052734375, + "completions/mean_terminated_length": 729.3428344726562, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9521069735229876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11563039121539463, + "kl": 0.0294189453125, + "learning_rate": 1.0634503078275669e-07, + "loss": 0.0567, + "num_tokens": 2425970238.0, + "reward": 2.5200893878936768, + "reward_std": 0.34107157588005066, + "rewards/accuracy_reward/mean": 0.6111111044883728, + "rewards/accuracy_reward/std": 0.4880632162094116, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.09730303287506104, + "step": 4468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1080.65625, + "completions/mean_terminated_length": 844.1944580078125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.9523200681902936, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1144333209386932, + "kl": 0.02490234375, + "learning_rate": 1.0628913516600608e-07, + "loss": 0.0603, + "num_tokens": 2426525508.0, + "reward": 2.412388563156128, + "reward_std": 0.40310806035995483, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.09746808558702469, + "step": 4469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 910.8303833007812, + "completions/mean_terminated_length": 728.1761474609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9525331628575995, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1306880959362473, + "kl": 0.0303955078125, + "learning_rate": 1.0623348510925593e-07, + "loss": 0.1337, + "num_tokens": 2426995304.0, + "reward": 2.5887277126312256, + "reward_std": 0.4283883273601532, + "rewards/accuracy_reward/mean": 0.6897321343421936, + "rewards/accuracy_reward/std": 0.46312037110328674, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13927440345287323, + "step": 4470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1007.4688110351562, + "completions/mean_terminated_length": 784.69921875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9527462575249055, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15743305872484828, + "kl": 0.02874755859375, + "learning_rate": 1.0617808064330438e-07, + "loss": 0.1309, + "num_tokens": 2427527258.0, + "reward": 2.271763563156128, + "reward_std": 0.4553113281726837, + "rewards/accuracy_reward/mean": 0.3816964328289032, + "rewards/accuracy_reward/std": 0.4863457679748535, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14125031232833862, + "step": 4471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 957.0692138671875, + "completions/mean_terminated_length": 765.2257080078125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9529593521922114, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1390581248103235, + "kl": 0.0286865234375, + "learning_rate": 1.0612292179881346e-07, + "loss": 0.1018, + "num_tokens": 2428030585.0, + "reward": 2.46875, + "reward_std": 0.4433434009552002, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.10992966592311859, + "step": 4472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 901.83935546875, + "completions/mean_terminated_length": 734.7518920898438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9531724468595173, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14149662350534029, + "kl": 0.02825927734375, + "learning_rate": 1.0606800860630954e-07, + "loss": 0.1067, + "num_tokens": 2428504129.0, + "reward": 2.4308037757873535, + "reward_std": 0.42274850606918335, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11119430512189865, + "step": 4473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 935.5000610351562, + "completions/mean_terminated_length": 732.96044921875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9533855415268233, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12986227710193882, + "kl": 0.02935791015625, + "learning_rate": 1.0601334109618267e-07, + "loss": 0.1072, + "num_tokens": 2428988449.0, + "reward": 2.498326063156128, + "reward_std": 0.4107213616371155, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15853238105773926, + "step": 4474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1015.8817138671875, + "completions/mean_terminated_length": 749.1544799804688, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9535986361941292, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14775375946306477, + "kl": 0.0277099609375, + "learning_rate": 1.0595891929868723e-07, + "loss": 0.0765, + "num_tokens": 2429511724.0, + "reward": 2.2818081378936768, + "reward_std": 0.47862571477890015, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.15853238105773926, + "step": 4475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1117.29248046875, + "completions/mean_terminated_length": 863.4630737304688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9538117308614352, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11815939336230749, + "kl": 0.023529052734375, + "learning_rate": 1.0590474324394155e-07, + "loss": 0.0453, + "num_tokens": 2430078079.0, + "reward": 2.4051339626312256, + "reward_std": 0.3865298628807068, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12062390148639679, + "step": 4476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1102.2835693359375, + "completions/mean_terminated_length": 834.0143432617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9540248255287411, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11567708777955636, + "kl": 0.024261474609375, + "learning_rate": 1.0585081296192788e-07, + "loss": 0.0699, + "num_tokens": 2430648110.0, + "reward": 2.427455425262451, + "reward_std": 0.41465842723846436, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12292034178972244, + "step": 4477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 953.9375610351562, + "completions/mean_terminated_length": 716.0978393554688, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9542379201960471, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 5.333851376858454, + "kl": 0.1162109375, + "learning_rate": 1.057971284824925e-07, + "loss": 0.0705, + "num_tokens": 2431143938.0, + "reward": 2.416294813156128, + "reward_std": 0.338835209608078, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9363839030265808, + "rewards/tag_count_reward/std": 0.2202649563550949, + "step": 4478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1085.180908203125, + "completions/mean_terminated_length": 812.0601806640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.954451014863353, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12293688416830006, + "kl": 0.02545166015625, + "learning_rate": 1.0574368983534565e-07, + "loss": 0.0469, + "num_tokens": 2431702323.0, + "reward": 2.2901787757873535, + "reward_std": 0.42563074827194214, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.13583585619926453, + "step": 4479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1118.274658203125, + "completions/mean_terminated_length": 854.5415649414062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9546641095306589, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12656611165390563, + "kl": 0.025299072265625, + "learning_rate": 1.0569049705006161e-07, + "loss": 0.1042, + "num_tokens": 2432273230.0, + "reward": 2.314174175262451, + "reward_std": 0.49043822288513184, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14611589908599854, + "step": 4480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 962.99560546875, + "completions/mean_terminated_length": 768.8368530273438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.954877204197965, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14755371311635543, + "kl": 0.0294189453125, + "learning_rate": 1.0563755015607845e-07, + "loss": 0.0923, + "num_tokens": 2432774860.0, + "reward": 2.377232313156128, + "reward_std": 0.5108281970024109, + "rewards/accuracy_reward/mean": 0.4910714328289032, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14955389499664307, + "step": 4481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 911.8326416015625, + "completions/mean_terminated_length": 697.859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9550902988652709, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.19361545496792237, + "kl": 0.03546142578125, + "learning_rate": 1.0558484918269823e-07, + "loss": 0.0422, + "num_tokens": 2433256801.0, + "reward": 2.497767925262451, + "reward_std": 0.4049406945705414, + "rewards/accuracy_reward/mean": 0.6319444179534912, + "rewards/accuracy_reward/std": 0.48283568024635315, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 4482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1045.9888916015625, + "completions/mean_terminated_length": 814.7554931640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9553033935325769, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12363393297317277, + "kl": 0.025543212890625, + "learning_rate": 1.0553239415908689e-07, + "loss": 0.0794, + "num_tokens": 2433795500.0, + "reward": 2.3359375, + "reward_std": 0.4033602476119995, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1502326875925064, + "step": 4483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1055.8616943359375, + "completions/mean_terminated_length": 846.7081298828125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.9555164881998828, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12489362531603199, + "kl": 0.027252197265625, + "learning_rate": 1.0548018511427429e-07, + "loss": 0.0596, + "num_tokens": 2434333662.0, + "reward": 2.451451063156128, + "reward_std": 0.4044599235057831, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14299827814102173, + "step": 4484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1065.9732666015625, + "completions/mean_terminated_length": 808.7098388671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9557295828671888, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1150258814940669, + "kl": 0.024627685546875, + "learning_rate": 1.0542822207715416e-07, + "loss": 0.0659, + "num_tokens": 2434879458.0, + "reward": 2.419642925262451, + "reward_std": 0.4273401200771332, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14530304074287415, + "step": 4485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 957.2098388671875, + "completions/mean_terminated_length": 758.6227416992188, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9559426775344947, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13386721870679222, + "kl": 0.026763916015625, + "learning_rate": 1.0537650507648399e-07, + "loss": 0.0814, + "num_tokens": 2435374576.0, + "reward": 2.459263563156128, + "reward_std": 0.404101699590683, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12130890041589737, + "step": 4486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 907.3928833007812, + "completions/mean_terminated_length": 737.7640991210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9561557722018007, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13229215692504598, + "kl": 0.030059814453125, + "learning_rate": 1.0532503414088523e-07, + "loss": 0.0638, + "num_tokens": 2435842704.0, + "reward": 2.4246652126312256, + "reward_std": 0.3579663038253784, + "rewards/accuracy_reward/mean": 0.4977678656578064, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.12151455134153366, + "step": 4487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 975.16748046875, + "completions/mean_terminated_length": 786.5065307617188, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.9563688668691066, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12698549637986675, + "kl": 0.028289794921875, + "learning_rate": 1.0527380929884322e-07, + "loss": 0.0784, + "num_tokens": 2436343051.0, + "reward": 2.560826063156128, + "reward_std": 0.4783167839050293, + "rewards/accuracy_reward/mean": 0.6763392686843872, + "rewards/accuracy_reward/std": 0.46839532256126404, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.14097605645656586, + "step": 4488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 992.4063110351562, + "completions/mean_terminated_length": 780.155517578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9565819615364125, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13632784793579203, + "kl": 0.02777099609375, + "learning_rate": 1.0522283057870675e-07, + "loss": 0.0886, + "num_tokens": 2436859585.0, + "reward": 2.4330358505249023, + "reward_std": 0.3583570420742035, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9866071343421936, + "rewards/tag_count_reward/std": 0.08746545761823654, + "step": 4489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1001.0045166015625, + "completions/mean_terminated_length": 823.31591796875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.9567950562037185, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12344958373449007, + "kl": 0.030548095703125, + "learning_rate": 1.0517209800868883e-07, + "loss": 0.0588, + "num_tokens": 2437375603.0, + "reward": 2.533482313156128, + "reward_std": 0.4051283001899719, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12083587795495987, + "step": 4490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 975.4129638671875, + "completions/mean_terminated_length": 783.4763793945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9570081508710244, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14000978165755174, + "kl": 0.029998779296875, + "learning_rate": 1.0512161161686593e-07, + "loss": 0.0901, + "num_tokens": 2437877644.0, + "reward": 2.3989956378936768, + "reward_std": 0.4502313733100891, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.15647155046463013, + "step": 4491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 951.607177734375, + "completions/mean_terminated_length": 765.5352783203125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.9572212455383304, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14111165238033374, + "kl": 0.028350830078125, + "learning_rate": 1.0507137143117852e-07, + "loss": 0.0757, + "num_tokens": 2438369740.0, + "reward": 2.493861675262451, + "reward_std": 0.438626229763031, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11561823636293411, + "step": 4492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1108.372802734375, + "completions/mean_terminated_length": 831.372802734375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.9574343402056363, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1224551444227295, + "kl": 0.024139404296875, + "learning_rate": 1.0502137747943063e-07, + "loss": 0.0831, + "num_tokens": 2438944099.0, + "reward": 2.3604912757873535, + "reward_std": 0.4305449426174164, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1396559476852417, + "step": 4493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1040.2388916015625, + "completions/mean_terminated_length": 769.0283203125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9576474348729423, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.5186555658797078, + "kl": 0.049163818359375, + "learning_rate": 1.0497162978929006e-07, + "loss": 0.1269, + "num_tokens": 2439488270.0, + "reward": 2.4056921005249023, + "reward_std": 0.4129801094532013, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.500314474105835, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14469929039478302, + "step": 4494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 992.5692138671875, + "completions/mean_terminated_length": 816.6640625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9578605295402483, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12198193137182425, + "kl": 0.027069091796875, + "learning_rate": 1.0492212838828843e-07, + "loss": 0.113, + "num_tokens": 2439995533.0, + "reward": 2.560267925262451, + "reward_std": 0.4273020029067993, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11849905550479889, + "step": 4495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1023.357177734375, + "completions/mean_terminated_length": 817.3297729492188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9580736242075542, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13344019344455174, + "kl": 0.029571533203125, + "learning_rate": 1.048728733038209e-07, + "loss": 0.1146, + "num_tokens": 2440522221.0, + "reward": 2.4330358505249023, + "reward_std": 0.49730271100997925, + "rewards/accuracy_reward/mean": 0.5803571343421936, + "rewards/accuracy_reward/std": 0.4940521717071533, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9441964030265808, + "rewards/tag_count_reward/std": 0.1875898689031601, + "step": 4496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 972.7053833007812, + "completions/mean_terminated_length": 773.5767211914062, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.9582867188748602, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1315559309453804, + "kl": 0.027435302734375, + "learning_rate": 1.0482386456314645e-07, + "loss": 0.0977, + "num_tokens": 2441017673.0, + "reward": 2.50390625, + "reward_std": 0.4680311679840088, + "rewards/accuracy_reward/mean": 0.6383928656578064, + "rewards/accuracy_reward/std": 0.4810029864311218, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.29793688654899597, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.14748506247997284, + "step": 4497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1062.8170166015625, + "completions/mean_terminated_length": 864.723876953125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9584998135421661, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13636840728692867, + "kl": 0.027801513671875, + "learning_rate": 1.0477510219338761e-07, + "loss": 0.0787, + "num_tokens": 2441557687.0, + "reward": 2.5066964626312256, + "reward_std": 0.43101081252098083, + "rewards/accuracy_reward/mean": 0.6049107313156128, + "rewards/accuracy_reward/std": 0.4894163906574249, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1457662582397461, + "step": 4498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1089.060302734375, + "completions/mean_terminated_length": 864.51513671875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9587129082094721, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12927991916379877, + "kl": 0.0260009765625, + "learning_rate": 1.0472658622153066e-07, + "loss": 0.1102, + "num_tokens": 2442123698.0, + "reward": 2.321986675262451, + "reward_std": 0.4250272512435913, + "rewards/accuracy_reward/mean": 0.4352678656578064, + "rewards/accuracy_reward/std": 0.4963463246822357, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13180458545684814, + "step": 4499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 968.1339721679688, + "completions/mean_terminated_length": 744.0107421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.958926002876778, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11612721789367826, + "kl": 0.029144287109375, + "learning_rate": 1.0467831667442545e-07, + "loss": 0.0326, + "num_tokens": 2442635166.0, + "reward": 2.3956475257873535, + "reward_std": 0.3881060779094696, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.14102916419506073, + "step": 4500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1045.9398193359375, + "completions/mean_terminated_length": 794.025146484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.959139097544084, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12721827847348982, + "kl": 0.0238037109375, + "learning_rate": 1.0463029357878548e-07, + "loss": 0.0459, + "num_tokens": 2443178883.0, + "reward": 2.34375, + "reward_std": 0.43537646532058716, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9107142686843872, + "rewards/format_reward/std": 0.2854745090007782, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15090012550354004, + "step": 4501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 974.7188110351562, + "completions/mean_terminated_length": 775.9629516601562, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9593521922113899, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13129852377983098, + "kl": 0.0286865234375, + "learning_rate": 1.045825169611879e-07, + "loss": 0.0786, + "num_tokens": 2443681925.0, + "reward": 2.330357313156128, + "reward_std": 0.44588029384613037, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14619384706020355, + "step": 4502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 886.0402221679688, + "completions/mean_terminated_length": 713.2359008789062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9595652868786959, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1428843302307251, + "kl": 0.030853271484375, + "learning_rate": 1.0453498684807333e-07, + "loss": 0.0566, + "num_tokens": 2444149207.0, + "reward": 2.478236675262451, + "reward_std": 0.42918145656585693, + "rewards/accuracy_reward/mean": 0.5925925970077515, + "rewards/accuracy_reward/std": 0.49192148447036743, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.1217813789844513, + "step": 4503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1014.794677734375, + "completions/mean_terminated_length": 807.0455932617188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9597783815460018, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12925826194484052, + "kl": 0.02777099609375, + "learning_rate": 1.0448770326574616e-07, + "loss": 0.1167, + "num_tokens": 2444683291.0, + "reward": 2.3699777126312256, + "reward_std": 0.42711836099624634, + "rewards/accuracy_reward/mean": 0.4821428656578064, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14661914110183716, + "step": 4504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1010.40185546875, + "completions/mean_terminated_length": 801.7694702148438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.9599914762133077, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 2.022418570712354, + "kl": 0.09954833984375, + "learning_rate": 1.0444066624037418e-07, + "loss": 0.0849, + "num_tokens": 2445209583.0, + "reward": 2.380580425262451, + "reward_std": 0.4589422345161438, + "rewards/accuracy_reward/mean": 0.5066964030265808, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.14259286224842072, + "step": 4505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 906.6295166015625, + "completions/mean_terminated_length": 712.92431640625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9602045708806137, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14450923763817577, + "kl": 0.029632568359375, + "learning_rate": 1.0439387579798868e-07, + "loss": 0.1002, + "num_tokens": 2445683753.0, + "reward": 2.4068081378936768, + "reward_std": 0.3942721486091614, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5005589723587036, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.11993236839771271, + "step": 4506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1208.665283203125, + "completions/mean_terminated_length": 898.0856323242188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9604176655479196, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.18833848354670926, + "kl": 0.02337646484375, + "learning_rate": 1.0434733196448481e-07, + "loss": 0.0444, + "num_tokens": 2446300323.0, + "reward": 2.3387277126312256, + "reward_std": 0.44807061553001404, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10346972197294235, + "step": 4507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 941.7076416015625, + "completions/mean_terminated_length": 802.7261352539062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9606307602152256, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1307990702578527, + "kl": 0.0299072265625, + "learning_rate": 1.0430103476562078e-07, + "loss": 0.0601, + "num_tokens": 2446789584.0, + "reward": 2.4927456378936768, + "reward_std": 0.3784714639186859, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1091761440038681, + "step": 4508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 930.9241333007812, + "completions/mean_terminated_length": 768.0767211914062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9608438548825315, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.14240070136879604, + "kl": 0.02880859375, + "learning_rate": 1.0425498422701872e-07, + "loss": 0.0569, + "num_tokens": 2447281710.0, + "reward": 2.4910714626312256, + "reward_std": 0.3062003254890442, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493200302124, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09024729579687119, + "step": 4509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 949.9598388671875, + "completions/mean_terminated_length": 766.953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9610569495498376, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1283643509528888, + "kl": 0.030670166015625, + "learning_rate": 1.0420918037416405e-07, + "loss": 0.0883, + "num_tokens": 2447778604.0, + "reward": 2.4760046005249023, + "reward_std": 0.36500662565231323, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10681798309087753, + "step": 4510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1032.4107666015625, + "completions/mean_terminated_length": 808.2615966796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9612700442171435, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11244210563867672, + "kl": 0.025177001953125, + "learning_rate": 1.0416362323240563e-07, + "loss": 0.0542, + "num_tokens": 2448316484.0, + "reward": 2.3917412757873535, + "reward_std": 0.37942928075790405, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1302192062139511, + "step": 4511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 951.0558471679688, + "completions/mean_terminated_length": 781.4252319335938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.9614831388844494, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12907304274929046, + "kl": 0.029052734375, + "learning_rate": 1.041183128269559e-07, + "loss": 0.0689, + "num_tokens": 2448815437.0, + "reward": 2.4068081378936768, + "reward_std": 0.4706929326057434, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9536830186843872, + "rewards/tag_count_reward/std": 0.17443691194057465, + "step": 4512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1030.294677734375, + "completions/mean_terminated_length": 863.7610473632812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9616962335517554, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11913687669526882, + "kl": 0.027069091796875, + "learning_rate": 1.0407324918289062e-07, + "loss": 0.0948, + "num_tokens": 2449344033.0, + "reward": 2.4603796005249023, + "reward_std": 0.33730754256248474, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.0952264592051506, + "step": 4513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1102.3460693359375, + "completions/mean_terminated_length": 899.888916015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9619093282190613, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11906813841396985, + "kl": 0.024261474609375, + "learning_rate": 1.0402843232514919e-07, + "loss": 0.0814, + "num_tokens": 2449915660.0, + "reward": 2.24609375, + "reward_std": 0.4485597610473633, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.470055490732193, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13485698401927948, + "step": 4514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 989.3906860351562, + "completions/mean_terminated_length": 759.2581787109375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9621224228863673, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14435496849812327, + "kl": 0.027587890625, + "learning_rate": 1.0398386227853424e-07, + "loss": 0.1196, + "num_tokens": 2450422379.0, + "reward": 2.510044813156128, + "reward_std": 0.47749942541122437, + "rewards/accuracy_reward/mean": 0.6388888955116272, + "rewards/accuracy_reward/std": 0.480879545211792, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.14819122850894928, + "step": 4515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 900.8839721679688, + "completions/mean_terminated_length": 723.4948120117188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9623355175536732, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12876539241636106, + "kl": 0.0308837890625, + "learning_rate": 1.039395390677119e-07, + "loss": 0.0433, + "num_tokens": 2450891415.0, + "reward": 2.4927456378936768, + "reward_std": 0.3339018225669861, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9860491156578064, + "rewards/tag_count_reward/std": 0.08657421171665192, + "step": 4516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1130.669677734375, + "completions/mean_terminated_length": 893.6067504882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9625486122209792, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12696968904337438, + "kl": 0.02569580078125, + "learning_rate": 1.0389546271721169e-07, + "loss": 0.0441, + "num_tokens": 2451475907.0, + "reward": 2.3387277126312256, + "reward_std": 0.39402276277542114, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1226801648736, + "step": 4517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 937.7522583007812, + "completions/mean_terminated_length": 762.751953125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.9627617068882851, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12296301672335991, + "kl": 0.02825927734375, + "learning_rate": 1.0385163325142645e-07, + "loss": 0.0726, + "num_tokens": 2451960356.0, + "reward": 2.3934152126312256, + "reward_std": 0.3388652205467224, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174957811832428, + "rewards/tag_count_reward/mean": 0.9827008843421936, + "rewards/tag_count_reward/std": 0.10230488330125809, + "step": 4518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 1121.82373046875, + "completions/mean_terminated_length": 892.2144775390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9629748015555911, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13172194446205926, + "kl": 0.025299072265625, + "learning_rate": 1.0380805069461247e-07, + "loss": 0.0988, + "num_tokens": 2452531861.0, + "reward": 2.279576063156128, + "reward_std": 0.5081182718276978, + "rewards/accuracy_reward/mean": 0.43287035822868347, + "rewards/accuracy_reward/std": 0.4960475564002991, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.18185119330883026, + "step": 4519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 928.77685546875, + "completions/mean_terminated_length": 738.830322265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.963187896222897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1395025307125469, + "kl": 0.029632568359375, + "learning_rate": 1.0376471507088933e-07, + "loss": 0.0692, + "num_tokens": 2453017873.0, + "reward": 2.5167412757873535, + "reward_std": 0.3990950882434845, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9832589030265808, + "rewards/tag_count_reward/std": 0.1057564839720726, + "step": 4520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1051.6898193359375, + "completions/mean_terminated_length": 851.3592529296875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9634009908902029, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11776002520932452, + "kl": 0.025543212890625, + "learning_rate": 1.0372162640423998e-07, + "loss": 0.0689, + "num_tokens": 2453568214.0, + "reward": 2.345424175262451, + "reward_std": 0.3817687928676605, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.15223345160484314, + "step": 4521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 992.3638916015625, + "completions/mean_terminated_length": 783.4946899414062, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.9636140855575089, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12764886347682394, + "kl": 0.02728271484375, + "learning_rate": 1.0367878471851078e-07, + "loss": 0.104, + "num_tokens": 2454079337.0, + "reward": 2.5619421005249023, + "reward_std": 0.3644016683101654, + "rewards/accuracy_reward/mean": 0.6428571343421936, + "rewards/accuracy_reward/std": 0.47969308495521545, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11170817166566849, + "step": 4522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 957.63623046875, + "completions/mean_terminated_length": 775.9088745117188, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9638271802248148, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.16981841400533865, + "kl": 0.02850341796875, + "learning_rate": 1.0363619003741125e-07, + "loss": 0.0551, + "num_tokens": 2454584358.0, + "reward": 2.3588171005249023, + "reward_std": 0.403976172208786, + "rewards/accuracy_reward/mean": 0.4598214328289032, + "rewards/accuracy_reward/std": 0.49894019961357117, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12201692909002304, + "step": 4523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 918.8772583007812, + "completions/mean_terminated_length": 826.1473388671875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9640402748921209, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1487468950018143, + "kl": 0.0311279296875, + "learning_rate": 1.0359384238451425e-07, + "loss": 0.0589, + "num_tokens": 2455070847.0, + "reward": 2.420201063156128, + "reward_std": 0.4080529510974884, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.141964390873909, + "step": 4524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 1121.3504638671875, + "completions/mean_terminated_length": 834.143310546875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9642533695594268, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12331570402037917, + "kl": 0.022674560546875, + "learning_rate": 1.03551741783256e-07, + "loss": 0.0639, + "num_tokens": 2455648572.0, + "reward": 2.234375, + "reward_std": 0.41747739911079407, + "rewards/accuracy_reward/mean": 0.3472222089767456, + "rewards/accuracy_reward/std": 0.47663912177085876, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 4525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1063.8148193359375, + "completions/mean_terminated_length": 826.6287841796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9644664642267328, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13160024643926505, + "kl": 0.02655029296875, + "learning_rate": 1.0350988825693606e-07, + "loss": 0.0878, + "num_tokens": 2456197513.0, + "reward": 2.361607313156128, + "reward_std": 0.44072797894477844, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.12312835454940796, + "step": 4526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 981.07373046875, + "completions/mean_terminated_length": 745.593994140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9646795588940387, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13488917479080317, + "kl": 0.028106689453125, + "learning_rate": 1.0346828182871701e-07, + "loss": 0.1232, + "num_tokens": 2456700458.0, + "reward": 2.4614956378936768, + "reward_std": 0.4898920953273773, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16087746620178223, + "step": 4527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 988.1652221679688, + "completions/mean_terminated_length": 778.4652709960938, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9648926535613447, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14208512982695667, + "kl": 0.029205322265625, + "learning_rate": 1.0342692252162486e-07, + "loss": 0.0965, + "num_tokens": 2457210276.0, + "reward": 2.4129464626312256, + "reward_std": 0.4732236862182617, + "rewards/accuracy_reward/mean": 0.5486111044883728, + "rewards/accuracy_reward/std": 0.49820831418037415, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14997069537639618, + "step": 4528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1048.1942138671875, + "completions/mean_terminated_length": 807.2437744140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9651057482286506, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11639074797214444, + "kl": 0.025634765625, + "learning_rate": 1.033858103585489e-07, + "loss": 0.0691, + "num_tokens": 2457748123.0, + "reward": 2.353794813156128, + "reward_std": 0.3608246147632599, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1314026117324829, + "step": 4529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1011.888427734375, + "completions/mean_terminated_length": 832.8743286132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9653188428959565, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13563916315316754, + "kl": 0.027374267578125, + "learning_rate": 1.0334494536224146e-07, + "loss": 0.0688, + "num_tokens": 2458273049.0, + "reward": 2.521205425262451, + "reward_std": 0.4386194348335266, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1219823807477951, + "step": 4530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1009.51123046875, + "completions/mean_terminated_length": 773.3616943359375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9655319375632625, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14195993970076673, + "kl": 0.026123046875, + "learning_rate": 1.0330432755531823e-07, + "loss": 0.0948, + "num_tokens": 2458790078.0, + "reward": 2.4017858505249023, + "reward_std": 0.4572906792163849, + "rewards/accuracy_reward/mean": 0.5089285969734192, + "rewards/accuracy_reward/std": 0.5004791617393494, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1531175673007965, + "step": 4531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 952.2813110351562, + "completions/mean_terminated_length": 752.796875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9657450322305684, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11964988566802205, + "kl": 0.02716064453125, + "learning_rate": 1.0326395696025808e-07, + "loss": 0.0512, + "num_tokens": 2459283964.0, + "reward": 2.4893975257873535, + "reward_std": 0.41433700919151306, + "rewards/accuracy_reward/mean": 0.5758928656578064, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.09769836068153381, + "step": 4532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1190.571533203125, + "completions/mean_terminated_length": 989.796142578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9659581268978744, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1113744585120054, + "kl": 0.022186279296875, + "learning_rate": 1.0322383359940299e-07, + "loss": 0.0725, + "num_tokens": 2459890252.0, + "reward": 2.3744421005249023, + "reward_std": 0.48005563020706177, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.12471878528594971, + "step": 4533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1010.8660888671875, + "completions/mean_terminated_length": 781.9618530273438, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9661712215651803, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13680666861839627, + "kl": 0.02691650390625, + "learning_rate": 1.0318395749495825e-07, + "loss": 0.1173, + "num_tokens": 2460414064.0, + "reward": 2.3989956378936768, + "reward_std": 0.41356807947158813, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9547991156578064, + "rewards/tag_count_reward/std": 0.16820663213729858, + "step": 4534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 976.4285888671875, + "completions/mean_terminated_length": 801.0805053710938, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.9663843162324863, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11915491317299454, + "kl": 0.03057861328125, + "learning_rate": 1.0314432866899214e-07, + "loss": 0.0506, + "num_tokens": 2460922272.0, + "reward": 2.428013563156128, + "reward_std": 0.42051228880882263, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316656589508, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.15285542607307434, + "step": 4535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 940.5781860351562, + "completions/mean_terminated_length": 779.1381225585938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9665974108997922, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12732231780820708, + "kl": 0.030120849609375, + "learning_rate": 1.0310494714343628e-07, + "loss": 0.076, + "num_tokens": 2461409571.0, + "reward": 2.368861675262451, + "reward_std": 0.4147380292415619, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9581473469734192, + "rewards/tag_count_reward/std": 0.16403579711914062, + "step": 4536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 929.857177734375, + "completions/mean_terminated_length": 729.7684326171875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.9668105055670981, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1328882438595577, + "kl": 0.028839111328125, + "learning_rate": 1.0306581294008524e-07, + "loss": 0.045, + "num_tokens": 2461901907.0, + "reward": 2.34375, + "reward_std": 0.4486410617828369, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.14910244941711426, + "step": 4537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1038.9598388671875, + "completions/mean_terminated_length": 812.8906860351562, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9670236002344041, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13439032971320833, + "kl": 0.02667236328125, + "learning_rate": 1.0302692608059685e-07, + "loss": 0.1374, + "num_tokens": 2462432945.0, + "reward": 2.40234375, + "reward_std": 0.40621766448020935, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10681798309087753, + "step": 4538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1048.4866943359375, + "completions/mean_terminated_length": 837.7783813476562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.96723669490171, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.9398485401626545, + "kl": 0.02789306640625, + "learning_rate": 1.0298828658649198e-07, + "loss": 0.0987, + "num_tokens": 2462979755.0, + "reward": 2.3917412757873535, + "reward_std": 0.49047356843948364, + "rewards/accuracy_reward/mean": 0.5245535969734192, + "rewards/accuracy_reward/std": 0.49995502829551697, + "rewards/format_reward/mean": 0.9040178656578064, + "rewards/format_reward/std": 0.29489603638648987, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15610112249851227, + "step": 4539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1021.9107666015625, + "completions/mean_terminated_length": 828.6683959960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9674497895690161, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14057511149569235, + "kl": 0.026763916015625, + "learning_rate": 1.0294989447915468e-07, + "loss": 0.0938, + "num_tokens": 2463507907.0, + "reward": 2.431919813156128, + "reward_std": 0.41900837421417236, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.12707509100437164, + "step": 4540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1114.2098388671875, + "completions/mean_terminated_length": 772.5792236328125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.967662884236322, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11221753872712896, + "kl": 0.02606201171875, + "learning_rate": 1.02911749779832e-07, + "loss": 0.0652, + "num_tokens": 2464075121.0, + "reward": 2.3828125, + "reward_std": 0.35823947191238403, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12016765773296356, + "step": 4541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 994.3303833007812, + "completions/mean_terminated_length": 758.2622680664062, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.967875978903628, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1349976569501988, + "kl": 0.02874755859375, + "learning_rate": 1.0287385250963412e-07, + "loss": 0.062, + "num_tokens": 2464586645.0, + "reward": 2.3638393878936768, + "reward_std": 0.5170214176177979, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401403427124, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3124580383300781, + "rewards/tag_count_reward/mean": 0.9464285969734192, + "rewards/tag_count_reward/std": 0.1814327985048294, + "step": 4542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1027.5201416015625, + "completions/mean_terminated_length": 848.0656127929688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9680890735709339, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13378693307174494, + "kl": 0.026947021484375, + "learning_rate": 1.028362026895343e-07, + "loss": 0.0748, + "num_tokens": 2465113086.0, + "reward": 2.4375, + "reward_std": 0.4152251183986664, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.14903545379638672, + "step": 4543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1073.075927734375, + "completions/mean_terminated_length": 841.464111328125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9683021682382399, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11570952379882345, + "kl": 0.025604248046875, + "learning_rate": 1.0279880034036882e-07, + "loss": 0.0439, + "num_tokens": 2465661040.0, + "reward": 2.361607313156128, + "reward_std": 0.4305654466152191, + "rewards/accuracy_reward/mean": 0.4732142984867096, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1512802243232727, + "step": 4544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 965.2991333007812, + "completions/mean_terminated_length": 788.1298828125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9685152629055458, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1235048270019782, + "kl": 0.028564453125, + "learning_rate": 1.0276164548283702e-07, + "loss": 0.0263, + "num_tokens": 2466159414.0, + "reward": 2.450892925262451, + "reward_std": 0.3759031593799591, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971526861190796, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.09910815209150314, + "step": 4545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1102.930908203125, + "completions/mean_terminated_length": 903.7000122070312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9687283575728517, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11035869931876219, + "kl": 0.022491455078125, + "learning_rate": 1.0272473813750134e-07, + "loss": 0.0321, + "num_tokens": 2466723495.0, + "reward": 2.388951063156128, + "reward_std": 0.43316563963890076, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1395072489976883, + "step": 4546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 903.7388916015625, + "completions/mean_terminated_length": 769.6234741210938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9689414522401577, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1280188876211085, + "kl": 0.03082275390625, + "learning_rate": 1.0268807832478699e-07, + "loss": 0.0659, + "num_tokens": 2467190466.0, + "reward": 2.525111675262451, + "reward_std": 0.38166525959968567, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11067523807287216, + "step": 4547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 933.2053833007812, + "completions/mean_terminated_length": 764.1234130859375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9691545469074636, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.1271939841540721, + "kl": 0.027313232421875, + "learning_rate": 1.026516660649825e-07, + "loss": 0.055, + "num_tokens": 2467682446.0, + "reward": 2.470424175262451, + "reward_std": 0.4126937985420227, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9464285969734192, + "rewards/format_reward/std": 0.2254217267036438, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11780035495758057, + "step": 4548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 952.8370971679688, + "completions/mean_terminated_length": 763.6204223632812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9693676415747696, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13170114432518867, + "kl": 0.030487060546875, + "learning_rate": 1.0261550137823927e-07, + "loss": 0.063, + "num_tokens": 2468174629.0, + "reward": 2.3543527126312256, + "reward_std": 0.3728092908859253, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13416089117527008, + "step": 4549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 913.2076416015625, + "completions/mean_terminated_length": 747.7774658203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9695807362420755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13215448944842867, + "kl": 0.031982421875, + "learning_rate": 1.0257958428457169e-07, + "loss": 0.0491, + "num_tokens": 2468651106.0, + "reward": 2.56640625, + "reward_std": 0.3840310275554657, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.11898136883974075, + "step": 4550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 959.5803833007812, + "completions/mean_terminated_length": 784.7564697265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9697938309093815, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12489856807913137, + "kl": 0.027374267578125, + "learning_rate": 1.0254391480385704e-07, + "loss": 0.0675, + "num_tokens": 2469150470.0, + "reward": 2.3833706378936768, + "reward_std": 0.3548505902290344, + "rewards/accuracy_reward/mean": 0.4642857015132904, + "rewards/accuracy_reward/std": 0.4992803633213043, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824846744537354, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.11702417582273483, + "step": 4551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1048.203125, + "completions/mean_terminated_length": 834.1544799804688, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9700069255766874, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1438777115800212, + "kl": 0.025604248046875, + "learning_rate": 1.0250849295583575e-07, + "loss": 0.0963, + "num_tokens": 2469689137.0, + "reward": 2.421875, + "reward_std": 0.4538811445236206, + "rewards/accuracy_reward/mean": 0.5267857313156128, + "rewards/accuracy_reward/std": 0.4998401701450348, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1512802243232727, + "step": 4552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 864.7924194335938, + "completions/mean_terminated_length": 735.92822265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9702200202439933, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1314536151067218, + "kl": 0.031402587890625, + "learning_rate": 1.0247331876011102e-07, + "loss": 0.0247, + "num_tokens": 2470144292.0, + "reward": 2.549107313156128, + "reward_std": 0.41681161522865295, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4803536534309387, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.1136813759803772, + "step": 4553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1096.5804443359375, + "completions/mean_terminated_length": 819.6541748046875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9704331149112994, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12417003315360922, + "kl": 0.025177001953125, + "learning_rate": 1.0243839223614909e-07, + "loss": 0.0282, + "num_tokens": 2470705624.0, + "reward": 2.37890625, + "reward_std": 0.4431673288345337, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.15557540953159332, + "step": 4554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1038.321533203125, + "completions/mean_terminated_length": 822.1572265625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9706462095786053, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11236099387145804, + "kl": 0.026214599609375, + "learning_rate": 1.0240371340327916e-07, + "loss": 0.0634, + "num_tokens": 2471237592.0, + "reward": 2.431919813156128, + "reward_std": 0.43935853242874146, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15750236809253693, + "step": 4555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1035.368408203125, + "completions/mean_terminated_length": 811.8719482421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9708593042459113, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12001272095026856, + "kl": 0.026947021484375, + "learning_rate": 1.023692822806933e-07, + "loss": 0.0727, + "num_tokens": 2471767325.0, + "reward": 2.579799175262451, + "reward_std": 0.40030691027641296, + "rewards/accuracy_reward/mean": 0.6584821343421936, + "rewards/accuracy_reward/std": 0.4747488796710968, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.14126798510551453, + "step": 4556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1952.0, + "completions/mean_length": 984.2991333007812, + "completions/mean_terminated_length": 793.95263671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9710723989132172, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12931451861264318, + "kl": 0.02813720703125, + "learning_rate": 1.0233509888744648e-07, + "loss": 0.0862, + "num_tokens": 2472275763.0, + "reward": 2.4525671005249023, + "reward_std": 0.4722106456756592, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664569377899, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14418935775756836, + "step": 4557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1028.578125, + "completions/mean_terminated_length": 858.6744995117188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9712854935805232, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1302239957203249, + "kl": 0.02630615234375, + "learning_rate": 1.023011632424566e-07, + "loss": 0.0827, + "num_tokens": 2472809110.0, + "reward": 2.4910714626312256, + "reward_std": 0.5058425664901733, + "rewards/accuracy_reward/mean": 0.6138392686843872, + "rewards/accuracy_reward/std": 0.4874124526977539, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.1531175673007965, + "step": 4558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 981.0156860351562, + "completions/mean_terminated_length": 809.6347045898438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9714985882478291, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12474688090803428, + "kl": 0.028106689453125, + "learning_rate": 1.0226747536450442e-07, + "loss": 0.0549, + "num_tokens": 2473315469.0, + "reward": 2.3973214626312256, + "reward_std": 0.4472970962524414, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.10992966592311859, + "step": 4559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 962.1317138671875, + "completions/mean_terminated_length": 733.2189331054688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9717116829151351, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.35121812887412146, + "kl": 0.0341796875, + "learning_rate": 1.0223403527223367e-07, + "loss": 0.0489, + "num_tokens": 2473816424.0, + "reward": 2.5, + "reward_std": 0.4159477949142456, + "rewards/accuracy_reward/mean": 0.6004464030265808, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.1249600425362587, + "step": 4560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1056.7567138671875, + "completions/mean_terminated_length": 817.8698120117188, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.971924777582441, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12869293517339084, + "kl": 0.02471923828125, + "learning_rate": 1.0220084298415081e-07, + "loss": 0.0527, + "num_tokens": 2474355131.0, + "reward": 2.3956475257873535, + "reward_std": 0.4562458395957947, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15870553255081177, + "step": 4561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 994.1785888671875, + "completions/mean_terminated_length": 750.989013671875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9721378722497469, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13230402152417267, + "kl": 0.028961181640625, + "learning_rate": 1.0216789851862526e-07, + "loss": 0.0875, + "num_tokens": 2474874651.0, + "reward": 2.4213171005249023, + "reward_std": 0.4332205355167389, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13416090607643127, + "step": 4562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1111.680908203125, + "completions/mean_terminated_length": 817.8797607421875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9723509669170529, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11665414636447166, + "kl": 0.02392578125, + "learning_rate": 1.021352018938892e-07, + "loss": 0.0833, + "num_tokens": 2475447884.0, + "reward": 2.3348214626312256, + "reward_std": 0.48968520760536194, + "rewards/accuracy_reward/mean": 0.4330357015132904, + "rewards/accuracy_reward/std": 0.4960494339466095, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.14672233164310455, + "step": 4563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1009.5402221679688, + "completions/mean_terminated_length": 737.492919921875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9725640615843588, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1426924409758636, + "kl": 0.026702880859375, + "learning_rate": 1.0210275312803782e-07, + "loss": 0.0999, + "num_tokens": 2475973582.0, + "reward": 2.431361675262451, + "reward_std": 0.370706707239151, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9893973469734192, + "rewards/tag_count_reward/std": 0.08036740869283676, + "step": 4564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1090.65185546875, + "completions/mean_terminated_length": 853.3147583007812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9727771562516648, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12053280080268028, + "kl": 0.0255126953125, + "learning_rate": 1.0207055223902898e-07, + "loss": 0.0836, + "num_tokens": 2476533602.0, + "reward": 2.337611675262451, + "reward_std": 0.4732224941253662, + "rewards/accuracy_reward/mean": 0.4419642984867096, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13748814165592194, + "step": 4565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1028.6429443359375, + "completions/mean_terminated_length": 823.6782836914062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9729902509189707, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13058058956208124, + "kl": 0.026519775390625, + "learning_rate": 1.0203859924468339e-07, + "loss": 0.0671, + "num_tokens": 2477061170.0, + "reward": 2.443638563156128, + "reward_std": 0.49733930826187134, + "rewards/accuracy_reward/mean": 0.5856481194496155, + "rewards/accuracy_reward/std": 0.49318093061447144, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.16821405291557312, + "step": 4566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 930.46435546875, + "completions/mean_terminated_length": 757.6494750976562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9732033455862767, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14468255213054854, + "kl": 0.03131103515625, + "learning_rate": 1.0200689416268454e-07, + "loss": 0.0355, + "num_tokens": 2477548370.0, + "reward": 2.4386162757873535, + "reward_std": 0.4184619188308716, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.10404275357723236, + "step": 4567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1041.3660888671875, + "completions/mean_terminated_length": 822.5326538085938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9734164402535826, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11402298933738973, + "kl": 0.025634765625, + "learning_rate": 1.0197543701057887e-07, + "loss": 0.0587, + "num_tokens": 2478082838.0, + "reward": 2.373326063156128, + "reward_std": 0.3784697353839874, + "rewards/accuracy_reward/mean": 0.4308035671710968, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174957811832428, + "rewards/tag_count_reward/mean": 0.9849330186843872, + "rewards/tag_count_reward/std": 0.08956517279148102, + "step": 4568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 941.4910888671875, + "completions/mean_terminated_length": 757.0729370117188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9736295349208886, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.14566787121356425, + "kl": 0.031463623046875, + "learning_rate": 1.019442278057755e-07, + "loss": 0.0356, + "num_tokens": 2478578034.0, + "reward": 2.3253350257873535, + "reward_std": 0.40917083621025085, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9771205186843872, + "rewards/tag_count_reward/std": 0.1091761440038681, + "step": 4569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 950.0491333007812, + "completions/mean_terminated_length": 776.9871215820312, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9738426295881946, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1275142344408925, + "kl": 0.03118896484375, + "learning_rate": 1.0191326656554624e-07, + "loss": 0.0561, + "num_tokens": 2479071576.0, + "reward": 2.5223214626312256, + "reward_std": 0.42247384786605835, + "rewards/accuracy_reward/mean": 0.6071428656578064, + "rewards/accuracy_reward/std": 0.48893147706985474, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.08709356933832169, + "step": 4570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 982.7366333007812, + "completions/mean_terminated_length": 778.75, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9740557242555005, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1374145416413502, + "kl": 0.02734375, + "learning_rate": 1.0188255330702583e-07, + "loss": 0.0596, + "num_tokens": 2479580066.0, + "reward": 2.470982313156128, + "reward_std": 0.34999993443489075, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.13361193239688873, + "step": 4571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1067.622802734375, + "completions/mean_terminated_length": 854.497314453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9742688189228065, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 7.422554205054337, + "kl": 0.237060546875, + "learning_rate": 1.018520880472117e-07, + "loss": 0.0638, + "num_tokens": 2480132409.0, + "reward": 2.3203125, + "reward_std": 0.4069780707359314, + "rewards/accuracy_reward/mean": 0.4151785671710968, + "rewards/accuracy_reward/std": 0.49330368638038635, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13189572095870972, + "step": 4572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1163.46875, + "completions/mean_terminated_length": 882.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.9744819135901124, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.119016318684159, + "kl": 0.023284912109375, + "learning_rate": 1.0182187080296403e-07, + "loss": 0.0789, + "num_tokens": 2480732875.0, + "reward": 2.3052456378936768, + "reward_std": 0.4759340286254883, + "rewards/accuracy_reward/mean": 0.4017857015132904, + "rewards/accuracy_reward/std": 0.49080711603164673, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.1552460491657257, + "step": 4573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 976.43310546875, + "completions/mean_terminated_length": 757.5107421875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.9746950082574184, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11748737471838662, + "kl": 0.027069091796875, + "learning_rate": 1.0179190159100574e-07, + "loss": 0.0591, + "num_tokens": 2481230797.0, + "reward": 2.5513393878936768, + "reward_std": 0.3492380678653717, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.1086503118276596, + "step": 4574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1069.665283203125, + "completions/mean_terminated_length": 781.2543334960938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9749081029247243, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12790316023937567, + "kl": 0.026153564453125, + "learning_rate": 1.0176218042792257e-07, + "loss": 0.0705, + "num_tokens": 2481780887.0, + "reward": 2.3504464626312256, + "reward_std": 0.46307647228240967, + "rewards/accuracy_reward/mean": 0.4799107015132904, + "rewards/accuracy_reward/std": 0.5001547336578369, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9553571343421936, + "rewards/tag_count_reward/std": 0.17041954398155212, + "step": 4575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1056.8504638671875, + "completions/mean_terminated_length": 779.3285522460938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9751211975920303, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1045995023309239, + "kl": 0.025909423828125, + "learning_rate": 1.017327073301628e-07, + "loss": 0.042, + "num_tokens": 2482327972.0, + "reward": 2.4481027126312256, + "reward_std": 0.4026567041873932, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9748883843421936, + "rewards/tag_count_reward/std": 0.12870891392230988, + "step": 4576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1190.80810546875, + "completions/mean_terminated_length": 873.620849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9753342922593362, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11929608074266028, + "kl": 0.0224609375, + "learning_rate": 1.0170348231403762e-07, + "loss": 0.0587, + "num_tokens": 2482933998.0, + "reward": 2.1796875, + "reward_std": 0.4283905327320099, + "rewards/accuracy_reward/mean": 0.3013392984867096, + "rewards/accuracy_reward/std": 0.4593527019023895, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1464626044034958, + "step": 4577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 948.1272583007812, + "completions/mean_terminated_length": 726.9732055664062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.9755473869266421, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14064811189011178, + "kl": 0.028045654296875, + "learning_rate": 1.016745053957208e-07, + "loss": 0.0678, + "num_tokens": 2483431239.0, + "reward": 2.3744421005249023, + "reward_std": 0.4493979513645172, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.1456623673439026, + "step": 4578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1106.3616943359375, + "completions/mean_terminated_length": 818.10498046875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.9757604815939481, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12180798588477937, + "kl": 0.025299072265625, + "learning_rate": 1.0164577659124884e-07, + "loss": 0.0548, + "num_tokens": 2483992457.0, + "reward": 2.3638393878936768, + "reward_std": 0.40271928906440735, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.13480259478092194, + "step": 4579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 990.9464721679688, + "completions/mean_terminated_length": 757.645751953125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.975973576261254, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13191626035692133, + "kl": 0.027130126953125, + "learning_rate": 1.0161729591652094e-07, + "loss": 0.1161, + "num_tokens": 2484508913.0, + "reward": 2.4408483505249023, + "reward_std": 0.44707101583480835, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13709372282028198, + "step": 4580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 909.1585083007812, + "completions/mean_terminated_length": 749.7786254882812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.97618667092856, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12290649022163078, + "kl": 0.0291748046875, + "learning_rate": 1.0158906338729903e-07, + "loss": 0.054, + "num_tokens": 2484984184.0, + "reward": 2.4402902126312256, + "reward_std": 0.363080233335495, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396106719971, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.1131737232208252, + "step": 4581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 878.5647583007812, + "completions/mean_terminated_length": 721.6531982421875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9763997655958659, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12968401178556244, + "kl": 0.02911376953125, + "learning_rate": 1.0156107901920756e-07, + "loss": 0.0215, + "num_tokens": 2485446485.0, + "reward": 2.4034600257873535, + "reward_std": 0.3069804310798645, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855974316596985, + "rewards/format_reward/mean": 0.9642857313156128, + "rewards/format_reward/std": 0.18578432500362396, + "rewards/tag_count_reward/mean": 0.9838169813156128, + "rewards/tag_count_reward/std": 0.09244592487812042, + "step": 4582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 920.9933471679688, + "completions/mean_terminated_length": 763.2697143554688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.976612860263172, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14293294193855025, + "kl": 0.029754638671875, + "learning_rate": 1.015333428277338e-07, + "loss": 0.0642, + "num_tokens": 2485925250.0, + "reward": 2.4029018878936768, + "reward_std": 0.4738227427005768, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2918064594268799, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1576608121395111, + "step": 4583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1044.544677734375, + "completions/mean_terminated_length": 799.2555541992188, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.9768259549304779, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12787123560856678, + "kl": 0.02716064453125, + "learning_rate": 1.0150585482822757e-07, + "loss": 0.0715, + "num_tokens": 2486468262.0, + "reward": 2.318638563156128, + "reward_std": 0.4058946371078491, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13099703192710876, + "step": 4584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1056.6875, + "completions/mean_terminated_length": 782.7350463867188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9770390495977839, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10968157446980062, + "kl": 0.024688720703125, + "learning_rate": 1.014786150359014e-07, + "loss": 0.0812, + "num_tokens": 2487016842.0, + "reward": 2.3510046005249023, + "reward_std": 0.3474770188331604, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291378259658813, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.1098259910941124, + "step": 4585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1040.0335693359375, + "completions/mean_terminated_length": 817.5667724609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9772521442650898, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12099648631276447, + "kl": 0.0257568359375, + "learning_rate": 1.0145162346583037e-07, + "loss": 0.078, + "num_tokens": 2487555849.0, + "reward": 2.4129464626312256, + "reward_std": 0.4698973000049591, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9620535969734192, + "rewards/tag_count_reward/std": 0.16112655401229858, + "step": 4586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 844.7835083007812, + "completions/mean_terminated_length": 693.6256103515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9774652389323957, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15932053810230012, + "kl": 0.0350341796875, + "learning_rate": 1.0142488013295241e-07, + "loss": 0.0877, + "num_tokens": 2488005288.0, + "reward": 2.4213171005249023, + "reward_std": 0.4079744517803192, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1226801648736, + "step": 4587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1046.857177734375, + "completions/mean_terminated_length": 858.31298828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9776783335997017, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12455129273202828, + "kl": 0.024261474609375, + "learning_rate": 1.013983850520677e-07, + "loss": 0.0507, + "num_tokens": 2488558776.0, + "reward": 2.5066964626312256, + "reward_std": 0.45638835430145264, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11967316269874573, + "step": 4588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 853.8705444335938, + "completions/mean_terminated_length": 669.2113037109375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9778914282670076, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1207488082318332, + "kl": 0.031341552734375, + "learning_rate": 1.0137213823783937e-07, + "loss": 0.0634, + "num_tokens": 2489010030.0, + "reward": 2.510044813156128, + "reward_std": 0.35181424021720886, + "rewards/accuracy_reward/mean": 0.6026785969734192, + "rewards/accuracy_reward/std": 0.48989060521125793, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1421017199754715, + "step": 4589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1133.2835693359375, + "completions/mean_terminated_length": 846.260986328125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9781045229343136, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10599325655790867, + "kl": 0.022796630859375, + "learning_rate": 1.0134613970479301e-07, + "loss": 0.0388, + "num_tokens": 2489588333.0, + "reward": 2.365513563156128, + "reward_std": 0.3733319938182831, + "rewards/accuracy_reward/mean": 0.46990740299224854, + "rewards/accuracy_reward/std": 0.4996722936630249, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.13148215413093567, + "step": 4590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 860.8772583007812, + "completions/mean_terminated_length": 704.992431640625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9783176176016195, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14047312865017636, + "kl": 0.03009033203125, + "learning_rate": 1.013203894673168e-07, + "loss": 0.0675, + "num_tokens": 2490038246.0, + "reward": 2.4849331378936768, + "reward_std": 0.3962576687335968, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12224180996417999, + "step": 4591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 947.3616333007812, + "completions/mean_terminated_length": 767.2571411132812, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9785307122689255, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12733132700606797, + "kl": 0.029327392578125, + "learning_rate": 1.0129488753966151e-07, + "loss": 0.0496, + "num_tokens": 2490523560.0, + "reward": 2.4603796005249023, + "reward_std": 0.4324565827846527, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.13423532247543335, + "step": 4592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 924.8013916015625, + "completions/mean_terminated_length": 741.0051879882812, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9787438069362314, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.14259887359901344, + "kl": 0.032470703125, + "learning_rate": 1.0126963393594051e-07, + "loss": 0.0807, + "num_tokens": 2491008015.0, + "reward": 2.5691964626312256, + "reward_std": 0.45529991388320923, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.470055490732193, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.1523328423500061, + "step": 4593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1029.247802734375, + "completions/mean_terminated_length": 773.1368408203125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9789569016035373, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.122596561352527, + "kl": 0.027923583984375, + "learning_rate": 1.0124462867012975e-07, + "loss": 0.082, + "num_tokens": 2491532590.0, + "reward": 2.4559152126312256, + "reward_std": 0.375784695148468, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11067523807287216, + "step": 4594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 852.6272583007812, + "completions/mean_terminated_length": 705.8270874023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9791699962708433, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12906195772956755, + "kl": 0.0325927734375, + "learning_rate": 1.0121987175606772e-07, + "loss": 0.0407, + "num_tokens": 2491977511.0, + "reward": 2.571986675262451, + "reward_std": 0.37627726793289185, + "rewards/accuracy_reward/mean": 0.6607142686843872, + "rewards/accuracy_reward/std": 0.47399622201919556, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.1246887594461441, + "step": 4595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1011.9063110351562, + "completions/mean_terminated_length": 823.277099609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9793830909381492, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1282979030935075, + "kl": 0.02593994140625, + "learning_rate": 1.011953632074555e-07, + "loss": 0.0267, + "num_tokens": 2492501805.0, + "reward": 2.3470983505249023, + "reward_std": 0.4520379602909088, + "rewards/accuracy_reward/mean": 0.4575892984867096, + "rewards/accuracy_reward/std": 0.4987550377845764, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.1478201001882553, + "step": 4596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 911.2678833007812, + "completions/mean_terminated_length": 735.4844970703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9795961856054552, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1407619558062639, + "kl": 0.02862548828125, + "learning_rate": 1.0117110303785667e-07, + "loss": 0.0202, + "num_tokens": 2492975141.0, + "reward": 2.458705425262451, + "reward_std": 0.34212526679039, + "rewards/accuracy_reward/mean": 0.5357142686843872, + "rewards/accuracy_reward/std": 0.4992803931236267, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11632467061281204, + "step": 4597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1026.01123046875, + "completions/mean_terminated_length": 817.2177124023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9798092802727612, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12161803198252391, + "kl": 0.028228759765625, + "learning_rate": 1.0114709126069738e-07, + "loss": 0.0764, + "num_tokens": 2493500330.0, + "reward": 2.4481027126312256, + "reward_std": 0.4298285245895386, + "rewards/accuracy_reward/mean": 0.5580357313156128, + "rewards/accuracy_reward/std": 0.4971756041049957, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407156348228455, + "rewards/tag_count_reward/mean": 0.9592633843421936, + "rewards/tag_count_reward/std": 0.16000598669052124, + "step": 4598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1112.7388916015625, + "completions/mean_terminated_length": 833.5159912109375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9800223749400672, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13233298573463154, + "kl": 0.023590087890625, + "learning_rate": 1.0112332788926631e-07, + "loss": 0.1167, + "num_tokens": 2494073797.0, + "reward": 2.3950893878936768, + "reward_std": 0.4518040418624878, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11967317014932632, + "step": 4599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1042.6273193359375, + "completions/mean_terminated_length": 820.7329711914062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9802354696073731, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13560477908488563, + "kl": 0.027069091796875, + "learning_rate": 1.010998129367147e-07, + "loss": 0.0945, + "num_tokens": 2494612846.0, + "reward": 2.34765625, + "reward_std": 0.4202481508255005, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12825222313404083, + "step": 4600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 988.6563110351562, + "completions/mean_terminated_length": 772.231201171875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9804485642746791, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1288056139294807, + "kl": 0.02947998046875, + "learning_rate": 1.010765464160562e-07, + "loss": 0.0808, + "num_tokens": 2495126180.0, + "reward": 2.4263393878936768, + "reward_std": 0.3949876129627228, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.1086503118276596, + "step": 4601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1091.243408203125, + "completions/mean_terminated_length": 850.7178344726562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.980661658941985, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12712819782148058, + "kl": 0.024444580078125, + "learning_rate": 1.0105352834016717e-07, + "loss": 0.1197, + "num_tokens": 2495679793.0, + "reward": 2.400669813156128, + "reward_std": 0.4086795747280121, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.11899843066930771, + "step": 4602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1023.8951416015625, + "completions/mean_terminated_length": 843.8031616210938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9808747536092909, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14247333059229758, + "kl": 0.025482177734375, + "learning_rate": 1.0103075872178624e-07, + "loss": 0.0854, + "num_tokens": 2496201938.0, + "reward": 2.3599331378936768, + "reward_std": 0.46483901143074036, + "rewards/accuracy_reward/mean": 0.4665178656578064, + "rewards/accuracy_reward/std": 0.4994353652000427, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13335825502872467, + "step": 4603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 934.2188110351562, + "completions/mean_terminated_length": 812.9158325195312, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9810878482765969, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1301817503346622, + "kl": 0.027374267578125, + "learning_rate": 1.0100823757351468e-07, + "loss": 0.0463, + "num_tokens": 2496693236.0, + "reward": 2.5145089626312256, + "reward_std": 0.4172426760196686, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936831295490265, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12292034178972244, + "step": 4604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 897.8839721679688, + "completions/mean_terminated_length": 740.2537841796875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9813009429439028, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.15066847975770556, + "kl": 0.0321044921875, + "learning_rate": 1.0098596490781626e-07, + "loss": 0.0622, + "num_tokens": 2497171664.0, + "reward": 2.51171875, + "reward_std": 0.39012038707733154, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.1041671633720398, + "step": 4605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1234.9710693359375, + "completions/mean_terminated_length": 976.7147216796875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9815140376112088, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.10381698427275403, + "kl": 0.02154541015625, + "learning_rate": 1.0096394073701716e-07, + "loss": 0.0598, + "num_tokens": 2497798003.0, + "reward": 2.322544813156128, + "reward_std": 0.47268208861351013, + "rewards/accuracy_reward/mean": 0.4486607015132904, + "rewards/accuracy_reward/std": 0.49791330099105835, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.16375111043453217, + "step": 4606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1064.555908203125, + "completions/mean_terminated_length": 834.2727661132812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9817271322785147, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.4041997187709315, + "kl": 0.03558349609375, + "learning_rate": 1.0094216507330605e-07, + "loss": 0.0733, + "num_tokens": 2498349596.0, + "reward": 2.4285714626312256, + "reward_std": 0.4024399518966675, + "rewards/accuracy_reward/mean": 0.5044642686843872, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093553841114044, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12607400119304657, + "step": 4607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 908.0045166015625, + "completions/mean_terminated_length": 751.7614135742188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9819402269458207, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.15021375593534864, + "kl": 0.02667236328125, + "learning_rate": 1.0092063792873417e-07, + "loss": 0.1023, + "num_tokens": 2498825726.0, + "reward": 2.4190850257873535, + "reward_std": 0.3227223753929138, + "rewards/accuracy_reward/mean": 0.48148149251937866, + "rewards/accuracy_reward/std": 0.5002362728118896, + "rewards/format_reward/mean": 0.9732142686843872, + "rewards/format_reward/std": 0.1616371124982834, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10210946202278137, + "step": 4608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1027.290283203125, + "completions/mean_terminated_length": 781.3019409179688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9821533216131266, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.11745036497856069, + "kl": 0.02557373046875, + "learning_rate": 1.0089935931521508e-07, + "loss": 0.0819, + "num_tokens": 2499356256.0, + "reward": 2.4698662757873535, + "reward_std": 0.3586699664592743, + "rewards/accuracy_reward/mean": 0.5513392686843872, + "rewards/accuracy_reward/std": 0.49791327118873596, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.12292034178972244, + "step": 4609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 991.9598388671875, + "completions/mean_terminated_length": 796.3967895507812, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9823664162804325, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13498829912557073, + "kl": 0.026824951171875, + "learning_rate": 1.008783292445249e-07, + "loss": 0.0652, + "num_tokens": 2499875054.0, + "reward": 2.4408483505249023, + "reward_std": 0.41820070147514343, + "rewards/accuracy_reward/mean": 0.5334821343421936, + "rewards/accuracy_reward/std": 0.4994353950023651, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14262787997722626, + "step": 4610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 909.919677734375, + "completions/mean_terminated_length": 763.7178344726562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.9825795109477385, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13713169826934785, + "kl": 0.029632568359375, + "learning_rate": 1.0085754772830213e-07, + "loss": 0.0644, + "num_tokens": 2500349610.0, + "reward": 2.5970983505249023, + "reward_std": 0.38837674260139465, + "rewards/accuracy_reward/mean": 0.6919642686843872, + "rewards/accuracy_reward/std": 0.46219751238822937, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.13083133101463318, + "step": 4611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1118.1785888671875, + "completions/mean_terminated_length": 829.98828125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.9827926056150444, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11933252141063329, + "kl": 0.02288818359375, + "learning_rate": 1.0083701477804778e-07, + "loss": 0.0623, + "num_tokens": 2500928010.0, + "reward": 2.341517925262451, + "reward_std": 0.47632312774658203, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.15048591792583466, + "step": 4612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 965.450927734375, + "completions/mean_terminated_length": 798.04638671875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9830057002823505, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14426238466822397, + "kl": 0.027313232421875, + "learning_rate": 1.0081673040512528e-07, + "loss": 0.0882, + "num_tokens": 2501428276.0, + "reward": 2.5853796005249023, + "reward_std": 0.4392961263656616, + "rewards/accuracy_reward/mean": 0.6674107313156128, + "rewards/accuracy_reward/std": 0.47166746854782104, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11561822891235352, + "step": 4613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 916.4063110351562, + "completions/mean_terminated_length": 724.3603515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9832187949496564, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13164649836017533, + "kl": 0.02789306640625, + "learning_rate": 1.0079669462076038e-07, + "loss": 0.0466, + "num_tokens": 2501899546.0, + "reward": 2.6400671005249023, + "reward_std": 0.3391387164592743, + "rewards/accuracy_reward/mean": 0.7098214030265808, + "rewards/accuracy_reward/std": 0.4543519914150238, + "rewards/format_reward/mean": 0.9486607313156128, + "rewards/format_reward/std": 0.22093555331230164, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.0950164794921875, + "step": 4614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 950.8906860351562, + "completions/mean_terminated_length": 768.0390625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9834318896169624, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13825253741956947, + "kl": 0.028564453125, + "learning_rate": 1.0077690743604151e-07, + "loss": 0.0722, + "num_tokens": 2502387833.0, + "reward": 2.482701063156128, + "reward_std": 0.36174842715263367, + "rewards/accuracy_reward/mean": 0.5915178656578064, + "rewards/accuracy_reward/std": 0.49210265278816223, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822286784648895, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.1131737232208252, + "step": 4615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1017.6317138671875, + "completions/mean_terminated_length": 800.4189453125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9836449842842683, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12025794610810049, + "kl": 0.027008056640625, + "learning_rate": 1.0075736886191923e-07, + "loss": 0.0699, + "num_tokens": 2502920884.0, + "reward": 2.4609375, + "reward_std": 0.40975409746170044, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1286761611700058, + "step": 4616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1011.6406860351562, + "completions/mean_terminated_length": 789.7642211914062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9838580789515743, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13748170244070565, + "kl": 0.02825927734375, + "learning_rate": 1.0073807890920672e-07, + "loss": 0.0477, + "num_tokens": 2503452307.0, + "reward": 2.446986675262451, + "reward_std": 0.4406866133213043, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509716033935547, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9603794813156128, + "rewards/tag_count_reward/std": 0.1503853052854538, + "step": 4617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 995.7433471679688, + "completions/mean_terminated_length": 749.3471069335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9840711736188802, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11010353512670458, + "kl": 0.027984619140625, + "learning_rate": 1.0071903758857942e-07, + "loss": 0.061, + "num_tokens": 2503964544.0, + "reward": 2.3627233505249023, + "reward_std": 0.4092611074447632, + "rewards/accuracy_reward/mean": 0.4722222089767456, + "rewards/accuracy_reward/std": 0.49980661273002625, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14360485970973969, + "step": 4618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 994.6339721679688, + "completions/mean_terminated_length": 799.5661010742188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9842842682861861, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13308786196167907, + "kl": 0.026397705078125, + "learning_rate": 1.0070024491057531e-07, + "loss": 0.1027, + "num_tokens": 2504472860.0, + "reward": 2.41796875, + "reward_std": 0.4522850513458252, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9626116156578064, + "rewards/tag_count_reward/std": 0.1581934094429016, + "step": 4619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 842.2857666015625, + "completions/mean_terminated_length": 694.2155151367188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9844973629534921, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.14670130252315636, + "kl": 0.032196044921875, + "learning_rate": 1.0068170088559468e-07, + "loss": 0.059, + "num_tokens": 2504929516.0, + "reward": 2.5379464626312256, + "reward_std": 0.41874274611473083, + "rewards/accuracy_reward/mean": 0.6116071343421936, + "rewards/accuracy_reward/std": 0.4879295527935028, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10688954591751099, + "step": 4620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1022.2031860351562, + "completions/mean_terminated_length": 848.1123046875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.984710457620798, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12000329455197178, + "kl": 0.025909423828125, + "learning_rate": 1.0066340552390021e-07, + "loss": 0.0661, + "num_tokens": 2505457735.0, + "reward": 2.463169813156128, + "reward_std": 0.4484398663043976, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422912120819, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9676339030265808, + "rewards/tag_count_reward/std": 0.13507550954818726, + "step": 4621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 895.5379638671875, + "completions/mean_terminated_length": 737.5863037109375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.984923552288104, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.15414747514470606, + "kl": 0.02978515625, + "learning_rate": 1.0064535883561705e-07, + "loss": 0.1071, + "num_tokens": 2505932280.0, + "reward": 2.5474331378936768, + "reward_std": 0.3957427740097046, + "rewards/accuracy_reward/mean": 0.6643518805503845, + "rewards/accuracy_reward/std": 0.4727640450000763, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.13802284002304077, + "step": 4622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 900.2188110351562, + "completions/mean_terminated_length": 778.3555908203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9851366469554099, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12845038849644544, + "kl": 0.03033447265625, + "learning_rate": 1.0062756083073256e-07, + "loss": 0.0467, + "num_tokens": 2506400362.0, + "reward": 2.4849331378936768, + "reward_std": 0.3705940842628479, + "rewards/accuracy_reward/mean": 0.5825892686843872, + "rewards/accuracy_reward/std": 0.4936830997467041, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.1127205565571785, + "step": 4623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 943.4285888671875, + "completions/mean_terminated_length": 772.6185302734375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9853497416227159, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13469290304824677, + "kl": 0.0286865234375, + "learning_rate": 1.0061001151909662e-07, + "loss": 0.0485, + "num_tokens": 2506889450.0, + "reward": 2.5357143878936768, + "reward_std": 0.3804994821548462, + "rewards/accuracy_reward/mean": 0.6160714030265808, + "rewards/accuracy_reward/std": 0.48688453435897827, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9799107313156128, + "rewards/tag_count_reward/std": 0.11036036163568497, + "step": 4624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 962.013427734375, + "completions/mean_terminated_length": 816.2987670898438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9855628362900218, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.15586313497081986, + "kl": 0.030303955078125, + "learning_rate": 1.0059271091042145e-07, + "loss": 0.0677, + "num_tokens": 2507397360.0, + "reward": 2.513392925262451, + "reward_std": 0.5189526081085205, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.1353386640548706, + "step": 4625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 951.8504638671875, + "completions/mean_terminated_length": 779.0723876953125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9857759309573277, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12537519947580655, + "kl": 0.028778076171875, + "learning_rate": 1.005756590142816e-07, + "loss": 0.092, + "num_tokens": 2507891789.0, + "reward": 2.510044813156128, + "reward_std": 0.4289758503437042, + "rewards/accuracy_reward/mean": 0.6294642686843872, + "rewards/accuracy_reward/std": 0.48348814249038696, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9587053656578064, + "rewards/tag_count_reward/std": 0.1662929803133011, + "step": 4626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 973.9933471679688, + "completions/mean_terminated_length": 768.3323974609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9859890256246338, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1400751997365586, + "kl": 0.02630615234375, + "learning_rate": 1.00558855840114e-07, + "loss": 0.0999, + "num_tokens": 2508400586.0, + "reward": 2.4229912757873535, + "reward_std": 0.46192002296447754, + "rewards/accuracy_reward/mean": 0.5290178656578064, + "rewards/accuracy_reward/std": 0.49971529841423035, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1599874496459961, + "step": 4627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1098.7076416015625, + "completions/mean_terminated_length": 869.9307250976562, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.9862021202919397, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11808584481360845, + "kl": 0.0238037109375, + "learning_rate": 1.005423013972179e-07, + "loss": 0.0501, + "num_tokens": 2508967175.0, + "reward": 2.357142925262451, + "reward_std": 0.44420138001441956, + "rewards/accuracy_reward/mean": 0.4397321343421936, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10013572871685028, + "step": 4628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1105.7098388671875, + "completions/mean_terminated_length": 848.7216186523438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9864152149592457, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.11702725180264857, + "kl": 0.02337646484375, + "learning_rate": 1.0052599569475489e-07, + "loss": 0.0427, + "num_tokens": 2509538293.0, + "reward": 2.404576063156128, + "reward_std": 0.4085484445095062, + "rewards/accuracy_reward/mean": 0.4933035671710968, + "rewards/accuracy_reward/std": 0.5005140900611877, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13646738231182098, + "step": 4629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 992.8482666015625, + "completions/mean_terminated_length": 816.9896240234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9866283096265516, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13375888588934118, + "kl": 0.0289306640625, + "learning_rate": 1.0050993874174902e-07, + "loss": 0.097, + "num_tokens": 2510045265.0, + "reward": 2.4129464626312256, + "reward_std": 0.4630640745162964, + "rewards/accuracy_reward/mean": 0.5133928656578064, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.1299937218427658, + "step": 4630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1063.446533203125, + "completions/mean_terminated_length": 805.5211181640625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.9868414042938576, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.13318615237794618, + "kl": 0.027679443359375, + "learning_rate": 1.0049413054708648e-07, + "loss": 0.0862, + "num_tokens": 2510603977.0, + "reward": 2.3761162757873535, + "reward_std": 0.5042670369148254, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.8950892686843872, + "rewards/format_reward/std": 0.3067809045314789, + "rewards/tag_count_reward/mean": 0.9430803656578064, + "rewards/tag_count_reward/std": 0.18948033452033997, + "step": 4631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1013.6920166015625, + "completions/mean_terminated_length": 812.3466796875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9870544989611635, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.12440073149268047, + "kl": 0.0257568359375, + "learning_rate": 1.0047857111951591e-07, + "loss": 0.0582, + "num_tokens": 2511132255.0, + "reward": 2.435826063156128, + "reward_std": 0.39106255769729614, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.13894234597682953, + "step": 4632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1066.33935546875, + "completions/mean_terminated_length": 787.8739624023438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9872675936284695, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.10502918809200727, + "kl": 0.024810791015625, + "learning_rate": 1.0046326046764833e-07, + "loss": 0.0421, + "num_tokens": 2511684167.0, + "reward": 2.3314733505249023, + "reward_std": 0.3549799621105194, + "rewards/accuracy_reward/mean": 0.4241071343421936, + "rewards/accuracy_reward/std": 0.4947591722011566, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.1292569637298584, + "step": 4633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 985.3125610351562, + "completions/mean_terminated_length": 798.4356689453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9874806882957754, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13393003132771097, + "kl": 0.029205322265625, + "learning_rate": 1.0044819859995701e-07, + "loss": 0.1131, + "num_tokens": 2512197987.0, + "reward": 2.4347100257873535, + "reward_std": 0.4993771016597748, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9129464030265808, + "rewards/format_reward/std": 0.2822287082672119, + "rewards/tag_count_reward/mean": 0.9614955186843872, + "rewards/tag_count_reward/std": 0.14691685140132904, + "step": 4634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 966.82373046875, + "completions/mean_terminated_length": 735.352294921875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9876937829630813, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14013623607782247, + "kl": 0.02880859375, + "learning_rate": 1.0043338552477749e-07, + "loss": 0.0454, + "num_tokens": 2512697412.0, + "reward": 2.392857313156128, + "reward_std": 0.3513668477535248, + "rewards/accuracy_reward/mean": 0.4754464328289032, + "rewards/accuracy_reward/std": 0.4999549984931946, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9732142686843872, + "rewards/tag_count_reward/std": 0.12449962645769119, + "step": 4635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 1054.484375, + "completions/mean_terminated_length": 790.6694946289062, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9879068776303873, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1515832031858971, + "kl": 0.025665283203125, + "learning_rate": 1.0041882125030765e-07, + "loss": 0.0709, + "num_tokens": 2513240061.0, + "reward": 2.3247768878936768, + "reward_std": 0.39368030428886414, + "rewards/accuracy_reward/mean": 0.44675925374031067, + "rewards/accuracy_reward/std": 0.4977337718009949, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9631696343421936, + "rewards/tag_count_reward/std": 0.15247619152069092, + "step": 4636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1102.040283203125, + "completions/mean_terminated_length": 823.1734008789062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9881199722976932, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12121189322529195, + "kl": 0.02593994140625, + "learning_rate": 1.0040450578460779e-07, + "loss": 0.0297, + "num_tokens": 2513800559.0, + "reward": 2.3325893878936768, + "reward_std": 0.434671550989151, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.11119430512189865, + "step": 4637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 937.21435546875, + "completions/mean_terminated_length": 728.0211791992188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9883330669649992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14604665733245753, + "kl": 0.03173828125, + "learning_rate": 1.0039043913560035e-07, + "loss": 0.1058, + "num_tokens": 2514289039.0, + "reward": 2.4949777126312256, + "reward_std": 0.44186243414878845, + "rewards/accuracy_reward/mean": 0.5959821343421936, + "rewards/accuracy_reward/std": 0.49124953150749207, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13622933626174927, + "step": 4638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1084.466552734375, + "completions/mean_terminated_length": 858.8457641601562, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9885461616323051, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.19393625726608374, + "kl": 0.02923583984375, + "learning_rate": 1.0037662131107016e-07, + "loss": 0.1093, + "num_tokens": 2514845280.0, + "reward": 2.4135046005249023, + "reward_std": 0.4961911141872406, + "rewards/accuracy_reward/mean": 0.5401785969734192, + "rewards/accuracy_reward/std": 0.49894022941589355, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9559151530265808, + "rewards/tag_count_reward/std": 0.17015470564365387, + "step": 4639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 1055.046875, + "completions/mean_terminated_length": 822.5372314453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9887592562996111, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11947982399467978, + "kl": 0.0230712890625, + "learning_rate": 1.0036305231866438e-07, + "loss": 0.0543, + "num_tokens": 2515390485.0, + "reward": 2.4603796005249023, + "reward_std": 0.3660542368888855, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.49958035349845886, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.11440251022577286, + "step": 4640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 998.04248046875, + "completions/mean_terminated_length": 769.790771484375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.988972350966917, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1303225026000214, + "kl": 0.0291748046875, + "learning_rate": 1.0034973216589234e-07, + "loss": 0.077, + "num_tokens": 2515906584.0, + "reward": 2.2857143878936768, + "reward_std": 0.4210096001625061, + "rewards/accuracy_reward/mean": 0.4129464328289032, + "rewards/accuracy_reward/std": 0.49291375279426575, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9575892686843872, + "rewards/tag_count_reward/std": 0.16685132682323456, + "step": 4641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1059.341552734375, + "completions/mean_terminated_length": 803.8455200195312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.989185445634223, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1254670143802508, + "kl": 0.027740478515625, + "learning_rate": 1.0033666086012573e-07, + "loss": 0.0727, + "num_tokens": 2516448321.0, + "reward": 2.411830425262451, + "reward_std": 0.4222189784049988, + "rewards/accuracy_reward/mean": 0.5200892686843872, + "rewards/accuracy_reward/std": 0.5001547932624817, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.12648425996303558, + "step": 4642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1033.19873046875, + "completions/mean_terminated_length": 781.6183471679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.989398540301529, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14364224331612194, + "kl": 0.0291748046875, + "learning_rate": 1.0032383840859857e-07, + "loss": 0.0845, + "num_tokens": 2516982058.0, + "reward": 2.428013563156128, + "reward_std": 0.5446627736091614, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9525669813156128, + "rewards/tag_count_reward/std": 0.17413607239723206, + "step": 4643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1066.47998046875, + "completions/mean_terminated_length": 805.8502807617188, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9896116349688349, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1189858741285944, + "kl": 0.026763916015625, + "learning_rate": 1.003112648184071e-07, + "loss": 0.0607, + "num_tokens": 2517531297.0, + "reward": 2.353794813156128, + "reward_std": 0.40211236476898193, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9017857313156128, + "rewards/format_reward/std": 0.2979368567466736, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.10528324544429779, + "step": 4644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1058.84375, + "completions/mean_terminated_length": 813.6211547851562, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.9898247296361409, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11514824755009852, + "kl": 0.02667236328125, + "learning_rate": 1.0029894009650974e-07, + "loss": 0.0529, + "num_tokens": 2518082235.0, + "reward": 2.3978796005249023, + "reward_std": 0.39937183260917664, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9782366156578064, + "rewards/tag_count_reward/std": 0.10681798309087753, + "step": 4645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 933.7344360351562, + "completions/mean_terminated_length": 777.7938842773438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9900378243034468, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12217828664081505, + "kl": 0.030426025390625, + "learning_rate": 1.002868642497274e-07, + "loss": 0.0751, + "num_tokens": 2518569012.0, + "reward": 2.59375, + "reward_std": 0.41784849762916565, + "rewards/accuracy_reward/mean": 0.7008928656578064, + "rewards/accuracy_reward/std": 0.45837873220443726, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.15090014040470123, + "step": 4646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1039.622802734375, + "completions/mean_terminated_length": 749.8591918945312, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9902509189707528, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13056884542073685, + "kl": 0.025787353515625, + "learning_rate": 1.002750372847431e-07, + "loss": 0.1193, + "num_tokens": 2519103355.0, + "reward": 2.4140625, + "reward_std": 0.37828412652015686, + "rewards/accuracy_reward/mean": 0.4866071343421936, + "rewards/accuracy_reward/std": 0.5003793835639954, + "rewards/format_reward/mean": 0.9508928656578064, + "rewards/format_reward/std": 0.2163332849740982, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11709482222795486, + "step": 4647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1040.7991943359375, + "completions/mean_terminated_length": 844.7306518554688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9904640136380587, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.12949923624707887, + "kl": 0.025970458984375, + "learning_rate": 1.0026345920810216e-07, + "loss": 0.0687, + "num_tokens": 2519636305.0, + "reward": 2.43359375, + "reward_std": 0.5189981460571289, + "rewards/accuracy_reward/mean": 0.5424107313156128, + "rewards/accuracy_reward/std": 0.49875500798225403, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.9670758843421936, + "rewards/tag_count_reward/std": 0.1395251452922821, + "step": 4648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1027.8504638671875, + "completions/mean_terminated_length": 742.2085571289062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9906771083053647, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1276057211297755, + "kl": 0.026641845703125, + "learning_rate": 1.0025213002621215e-07, + "loss": 0.0685, + "num_tokens": 2520170670.0, + "reward": 2.392299175262451, + "reward_std": 0.4549683928489685, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342794418335, + "rewards/format_reward/mean": 0.9151785969734192, + "rewards/format_reward/std": 0.2789272665977478, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14515583217144012, + "step": 4649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 908.4308471679688, + "completions/mean_terminated_length": 738.9564208984375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9908902029726706, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13808139649512807, + "kl": 0.029754638671875, + "learning_rate": 1.0024104974534288e-07, + "loss": 0.0539, + "num_tokens": 2520644767.0, + "reward": 2.4386162757873535, + "reward_std": 0.3968634009361267, + "rewards/accuracy_reward/mean": 0.5446428656578064, + "rewards/accuracy_reward/std": 0.49855971336364746, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.12132563441991806, + "step": 4650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1019.857177734375, + "completions/mean_terminated_length": 772.0775146484375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9911032976399765, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.13312911762656093, + "kl": 0.02691650390625, + "learning_rate": 1.0023021837162648e-07, + "loss": 0.0777, + "num_tokens": 2521178495.0, + "reward": 2.3208706378936768, + "reward_std": 0.4349842369556427, + "rewards/accuracy_reward/mean": 0.4285714328289032, + "rewards/accuracy_reward/std": 0.49542489647865295, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.14611591398715973, + "step": 4651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1020.9576416015625, + "completions/mean_terminated_length": 824.2898559570312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9913163923072825, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12001805449988731, + "kl": 0.025970458984375, + "learning_rate": 1.0021963591105725e-07, + "loss": 0.1046, + "num_tokens": 2521706700.0, + "reward": 2.47265625, + "reward_std": 0.4653957486152649, + "rewards/accuracy_reward/mean": 0.5848214030265808, + "rewards/accuracy_reward/std": 0.49330365657806396, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9637276530265808, + "rewards/tag_count_reward/std": 0.15578390657901764, + "step": 4652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 980.372802734375, + "completions/mean_terminated_length": 762.25537109375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9915294869745884, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12421617497083236, + "kl": 0.0296630859375, + "learning_rate": 1.0020930236949182e-07, + "loss": 0.0619, + "num_tokens": 2522220323.0, + "reward": 2.44140625, + "reward_std": 0.34071144461631775, + "rewards/accuracy_reward/mean": 0.5178571343421936, + "rewards/accuracy_reward/std": 0.5002396702766418, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9815848469734192, + "rewards/tag_count_reward/std": 0.10210946202278137, + "step": 4653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 1061.47998046875, + "completions/mean_terminated_length": 766.9536743164062, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9917425816418944, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1286897519914757, + "kl": 0.0269775390625, + "learning_rate": 1.0019921775264897e-07, + "loss": 0.0737, + "num_tokens": 2522767226.0, + "reward": 2.458705425262451, + "reward_std": 0.468057245016098, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.1502326875925064, + "step": 4654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 1034.747802734375, + "completions/mean_terminated_length": 797.48486328125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9919556763092003, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13345709151656257, + "kl": 0.0274658203125, + "learning_rate": 1.0018938206610979e-07, + "loss": 0.0703, + "num_tokens": 2523301769.0, + "reward": 2.35546875, + "reward_std": 0.4986642599105835, + "rewards/accuracy_reward/mean": 0.5022321343421936, + "rewards/accuracy_reward/std": 0.5005539655685425, + "rewards/format_reward/mean": 0.8883928656578064, + "rewards/format_reward/std": 0.31523454189300537, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.141964390873909, + "step": 4655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 903.05810546875, + "completions/mean_terminated_length": 736.1483154296875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9921687709765064, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12642127426653382, + "kl": 0.030731201171875, + "learning_rate": 1.0017979531531753e-07, + "loss": 0.0672, + "num_tokens": 2523773379.0, + "reward": 2.5535714626312256, + "reward_std": 0.35648635029792786, + "rewards/accuracy_reward/mean": 0.6517857313156128, + "rewards/accuracy_reward/std": 0.476936936378479, + "rewards/format_reward/mean": 0.9263392686843872, + "rewards/format_reward/std": 0.2615099549293518, + "rewards/tag_count_reward/mean": 0.9754464030265808, + "rewards/tag_count_reward/std": 0.12270178645849228, + "step": 4656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 965.4464721679688, + "completions/mean_terminated_length": 785.0208740234375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.9923818656438123, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1414723323529596, + "kl": 0.028839111328125, + "learning_rate": 1.0017045750557779e-07, + "loss": 0.0787, + "num_tokens": 2524279579.0, + "reward": 2.459263563156128, + "reward_std": 0.4104117751121521, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12605296075344086, + "step": 4657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 870.4576416015625, + "completions/mean_terminated_length": 705.6615600585938, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.9925949603111183, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13522805651467137, + "kl": 0.0321044921875, + "learning_rate": 1.001613686420583e-07, + "loss": 0.0695, + "num_tokens": 2524733256.0, + "reward": 2.5831475257873535, + "reward_std": 0.40559300780296326, + "rewards/accuracy_reward/mean": 0.6785714030265808, + "rewards/accuracy_reward/std": 0.4675469994544983, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.2651226818561554, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10854540765285492, + "step": 4658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1012.3192138671875, + "completions/mean_terminated_length": 744.6713256835938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9928080549784242, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.15724317004364333, + "kl": 0.03277587890625, + "learning_rate": 1.0015252872978905e-07, + "loss": 0.0637, + "num_tokens": 2525257095.0, + "reward": 2.4581475257873535, + "reward_std": 0.3580390512943268, + "rewards/accuracy_reward/mean": 0.5379464030265808, + "rewards/accuracy_reward/std": 0.49911534786224365, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9760044813156128, + "rewards/tag_count_reward/std": 0.12109260261058807, + "step": 4659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 995.3683471679688, + "completions/mean_terminated_length": 773.462158203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9930211496457301, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.15067789642282367, + "kl": 0.032745361328125, + "learning_rate": 1.0014393777366226e-07, + "loss": 0.0927, + "num_tokens": 2525775932.0, + "reward": 2.416294813156128, + "reward_std": 0.3938027620315552, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21160738170146942, + "rewards/tag_count_reward/mean": 0.9743303656578064, + "rewards/tag_count_reward/std": 0.1302192062139511, + "step": 4660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1044.2701416015625, + "completions/mean_terminated_length": 842.44775390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9932342443130361, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12689424555589238, + "kl": 0.029296875, + "learning_rate": 1.0013559577843237e-07, + "loss": 0.0803, + "num_tokens": 2526316437.0, + "reward": 2.3677456378936768, + "reward_std": 0.5028325319290161, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.8995535969734192, + "rewards/format_reward/std": 0.30093035101890564, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.15587201714515686, + "step": 4661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 955.4420166015625, + "completions/mean_terminated_length": 766.6754150390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.993447338980342, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.13034694485164006, + "kl": 0.032440185546875, + "learning_rate": 1.0012750274871603e-07, + "loss": 0.0551, + "num_tokens": 2526816875.0, + "reward": 2.4949777126312256, + "reward_std": 0.521579384803772, + "rewards/accuracy_reward/mean": 0.6205357313156128, + "rewards/accuracy_reward/std": 0.48579615354537964, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.9659598469734192, + "rewards/tag_count_reward/std": 0.1508246213197708, + "step": 4662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 993.1116333007812, + "completions/mean_terminated_length": 804.3421630859375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.993660433647648, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.13244162705370577, + "kl": 0.02734375, + "learning_rate": 1.0011965868899214e-07, + "loss": 0.0852, + "num_tokens": 2527332877.0, + "reward": 2.3800225257873535, + "reward_std": 0.41356196999549866, + "rewards/accuracy_reward/mean": 0.4776785671710968, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9693080186843872, + "rewards/tag_count_reward/std": 0.1339094191789627, + "step": 4663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 979.8348388671875, + "completions/mean_terminated_length": 795.28271484375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9938735283149539, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1314969423834263, + "kl": 0.029632568359375, + "learning_rate": 1.0011206360360177e-07, + "loss": 0.0186, + "num_tokens": 2527840739.0, + "reward": 2.4693081378936768, + "reward_std": 0.4105337858200073, + "rewards/accuracy_reward/mean": 0.6203703880310059, + "rewards/accuracy_reward/std": 0.48585736751556396, + "rewards/format_reward/mean": 0.8973214030265808, + "rewards/format_reward/std": 0.30387789011001587, + "rewards/tag_count_reward/mean": 0.9737723469734192, + "rewards/tag_count_reward/std": 0.11222106218338013, + "step": 4664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 983.0670166015625, + "completions/mean_terminated_length": 834.030517578125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.9940866229822599, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.3185893363923411, + "kl": 0.02874755859375, + "learning_rate": 1.0010471749674815e-07, + "loss": 0.0655, + "num_tokens": 2528356449.0, + "reward": 2.3158483505249023, + "reward_std": 0.43175843358039856, + "rewards/accuracy_reward/mean": 0.3995535671710968, + "rewards/accuracy_reward/std": 0.49035418033599854, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11589459329843521, + "step": 4665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 868.2678833007812, + "completions/mean_terminated_length": 723.3884887695312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9942997176495658, + "frac_reward_zero_std": 0.1785714328289032, + "grad_norm": 0.1288605359348323, + "kl": 0.029541015625, + "learning_rate": 1.0009762037249691e-07, + "loss": 0.0632, + "num_tokens": 2528814409.0, + "reward": 2.552455425262451, + "reward_std": 0.3744348883628845, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.48843589425086975, + "rewards/format_reward/mean": 0.9620535969734192, + "rewards/format_reward/std": 0.191280335187912, + "rewards/tag_count_reward/mean": 0.9810267686843872, + "rewards/tag_count_reward/std": 0.11671038717031479, + "step": 4666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 814.7210083007812, + "completions/mean_terminated_length": 652.7752685546875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9945128123168717, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1412322653176845, + "kl": 0.0333251953125, + "learning_rate": 1.0009077223477566e-07, + "loss": 0.0584, + "num_tokens": 2529248620.0, + "reward": 2.5435268878936768, + "reward_std": 0.33847787976264954, + "rewards/accuracy_reward/mean": 0.6227678656578064, + "rewards/accuracy_reward/std": 0.48523563146591187, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11096391081809998, + "step": 4667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1060.571533203125, + "completions/mean_terminated_length": 825.9889526367188, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9947259069841777, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12865179422458642, + "kl": 0.024078369140625, + "learning_rate": 1.0008417308737438e-07, + "loss": 0.0861, + "num_tokens": 2529800700.0, + "reward": 2.3035714626312256, + "reward_std": 0.41533833742141724, + "rewards/accuracy_reward/mean": 0.3973214328289032, + "rewards/accuracy_reward/std": 0.48989057540893555, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9642857313156128, + "rewards/tag_count_reward/std": 0.1624998301267624, + "step": 4668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 915.30810546875, + "completions/mean_terminated_length": 756.788818359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9949390016514836, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.12364860070357474, + "kl": 0.026885986328125, + "learning_rate": 1.0007782293394515e-07, + "loss": 0.0372, + "num_tokens": 2530278086.0, + "reward": 2.5027902126312256, + "reward_std": 0.38372913002967834, + "rewards/accuracy_reward/mean": 0.5669642686843872, + "rewards/accuracy_reward/std": 0.4960494041442871, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.11601705849170685, + "step": 4669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 951.0313110351562, + "completions/mean_terminated_length": 768.203125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9951520963187896, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11027513433627976, + "kl": 0.026397705078125, + "learning_rate": 1.0007172177800232e-07, + "loss": 0.0746, + "num_tokens": 2530782452.0, + "reward": 2.5, + "reward_std": 0.37320226430892944, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9553571343421936, + "rewards/format_reward/std": 0.2067493349313736, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.1042405441403389, + "step": 4670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1149.7857666015625, + "completions/mean_terminated_length": 888.3458251953125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9953651909860955, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.11379161884338838, + "kl": 0.021942138671875, + "learning_rate": 1.0006586962292245e-07, + "loss": 0.0925, + "num_tokens": 2531373844.0, + "reward": 2.3828125, + "reward_std": 0.5054367184638977, + "rewards/accuracy_reward/mean": 0.4888392984867096, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9241071343421936, + "rewards/format_reward/std": 0.265122652053833, + "rewards/tag_count_reward/mean": 0.9698660969734192, + "rewards/tag_count_reward/std": 0.14262787997722626, + "step": 4671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1013.1406860351562, + "completions/mean_terminated_length": 850.0232543945312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9955782856534016, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.11989765254294334, + "kl": 0.0294189453125, + "learning_rate": 1.0006026647194422e-07, + "loss": 0.0761, + "num_tokens": 2531894163.0, + "reward": 2.5128350257873535, + "reward_std": 0.4939660429954529, + "rewards/accuracy_reward/mean": 0.6473214030265808, + "rewards/accuracy_reward/std": 0.4783378839492798, + "rewards/format_reward/mean": 0.9084821343421936, + "rewards/format_reward/std": 0.2886664867401123, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.161164328455925, + "step": 4672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1015.5223388671875, + "completions/mean_terminated_length": 780.73974609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9957913803207075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1483818037775009, + "kl": 0.028076171875, + "learning_rate": 1.000549123281685e-07, + "loss": 0.1088, + "num_tokens": 2532421309.0, + "reward": 2.4776787757873535, + "reward_std": 0.3812069296836853, + "rewards/accuracy_reward/mean": 0.5602678656578064, + "rewards/accuracy_reward/std": 0.49690937995910645, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9866071343421936, + "rewards/tag_count_reward/std": 0.08420762419700623, + "step": 4673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 932.466552734375, + "completions/mean_terminated_length": 782.787353515625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9960044749880135, + "frac_reward_zero_std": 0.1428571492433548, + "grad_norm": 0.1409235280739369, + "kl": 0.03118896484375, + "learning_rate": 1.0004980719455852e-07, + "loss": 0.0547, + "num_tokens": 2532905598.0, + "reward": 2.4720983505249023, + "reward_std": 0.39919596910476685, + "rewards/accuracy_reward/mean": 0.5691964030265808, + "rewards/accuracy_reward/std": 0.4957422614097595, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9720982313156128, + "rewards/tag_count_reward/std": 0.1219823881983757, + "step": 4674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 954.8795166015625, + "completions/mean_terminated_length": 769.3629150390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9962175696553194, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.12562608925846927, + "kl": 0.0291748046875, + "learning_rate": 1.0004495107393944e-07, + "loss": 0.0541, + "num_tokens": 2533406760.0, + "reward": 2.486049175262451, + "reward_std": 0.35308435559272766, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.1057254895567894, + "step": 4675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 820.482177734375, + "completions/mean_terminated_length": 722.872314453125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9964306643226253, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.14587879980622798, + "kl": 0.0325927734375, + "learning_rate": 1.0004034396899885e-07, + "loss": 0.0916, + "num_tokens": 2533846608.0, + "reward": 2.564732313156128, + "reward_std": 0.40221813321113586, + "rewards/accuracy_reward/mean": 0.6540178656578064, + "rewards/accuracy_reward/std": 0.47621920704841614, + "rewards/format_reward/mean": 0.9330357313156128, + "rewards/format_reward/std": 0.2502395808696747, + "rewards/tag_count_reward/mean": 0.9776785969734192, + "rewards/tag_count_reward/std": 0.1136813759803772, + "step": 4676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 944.2388916015625, + "completions/mean_terminated_length": 725.8475952148438, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9966437589899313, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.13062411935876078, + "kl": 0.028045654296875, + "learning_rate": 1.0003598588228639e-07, + "loss": 0.0788, + "num_tokens": 2534335275.0, + "reward": 2.47265625, + "reward_std": 0.3991635739803314, + "rewards/accuracy_reward/mean": 0.5558035969734192, + "rewards/accuracy_reward/std": 0.4974316358566284, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24233205616474152, + "rewards/tag_count_reward/mean": 0.9793526530265808, + "rewards/tag_count_reward/std": 0.10962118953466415, + "step": 4677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 960.44873046875, + "completions/mean_terminated_length": 759.0502319335938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9968568536572372, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1454670215034997, + "kl": 0.029815673828125, + "learning_rate": 1.0003187681621395e-07, + "loss": 0.091, + "num_tokens": 2534839492.0, + "reward": 2.4659600257873535, + "reward_std": 0.4732803404331207, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9174107313156128, + "rewards/format_reward/std": 0.2755681276321411, + "rewards/tag_count_reward/mean": 0.9704241156578064, + "rewards/tag_count_reward/std": 0.13927440345287323, + "step": 4678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1062.759033203125, + "completions/mean_terminated_length": 804.6535034179688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9970699483245432, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12514435583075872, + "kl": 0.026092529296875, + "learning_rate": 1.0002801677305553e-07, + "loss": 0.0614, + "num_tokens": 2535379688.0, + "reward": 2.4051339626312256, + "reward_std": 0.44050559401512146, + "rewards/accuracy_reward/mean": 0.4955357015132904, + "rewards/accuracy_reward/std": 0.5005390048027039, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9654017686843872, + "rewards/tag_count_reward/std": 0.15299931168556213, + "step": 4679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 999.6652221679688, + "completions/mean_terminated_length": 824.9427490234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9972830429918491, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1267200484425361, + "kl": 0.02752685546875, + "learning_rate": 1.000244057549474e-07, + "loss": 0.0974, + "num_tokens": 2535905234.0, + "reward": 2.4213171005249023, + "reward_std": 0.45943737030029297, + "rewards/accuracy_reward/mean": 0.5223214030265808, + "rewards/accuracy_reward/std": 0.5000599026679993, + "rewards/format_reward/mean": 0.9308035969734192, + "rewards/format_reward/std": 0.25407159328460693, + "rewards/tag_count_reward/mean": 0.9681919813156128, + "rewards/tag_count_reward/std": 0.14469929039478302, + "step": 4680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 955.93310546875, + "completions/mean_terminated_length": 760.5105590820312, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.9974961376591551, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.14216226081341846, + "kl": 0.0277099609375, + "learning_rate": 1.0002104376388805e-07, + "loss": 0.1024, + "num_tokens": 2536407332.0, + "reward": 2.4112725257873535, + "reward_std": 0.44071486592292786, + "rewards/accuracy_reward/mean": 0.5370370149612427, + "rewards/accuracy_reward/std": 0.49920448660850525, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13440261781215668, + "step": 4681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 780.4620971679688, + "completions/mean_terminated_length": 682.9591674804688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.997709232326461, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.1403292276309954, + "kl": 0.0352783203125, + "learning_rate": 1.0001793080173799e-07, + "loss": 0.0491, + "num_tokens": 2536827427.0, + "reward": 2.625, + "reward_std": 0.39604291319847107, + "rewards/accuracy_reward/mean": 0.7366071343421936, + "rewards/accuracy_reward/std": 0.44096609950065613, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26866820454597473, + "rewards/tag_count_reward/mean": 0.9665178656578064, + "rewards/tag_count_reward/std": 0.13058780133724213, + "step": 4682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1039.7835693359375, + "completions/mean_terminated_length": 813.8988647460938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9979223269937669, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12561680779151446, + "kl": 0.025604248046875, + "learning_rate": 1.0001506687022001e-07, + "loss": 0.0466, + "num_tokens": 2537368466.0, + "reward": 2.361607313156128, + "reward_std": 0.42108580470085144, + "rewards/accuracy_reward/mean": 0.4620535671710968, + "rewards/accuracy_reward/std": 0.49911531805992126, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.12733516097068787, + "step": 4683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 929.013427734375, + "completions/mean_terminated_length": 728.7737426757812, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9981354216610729, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.1337003579641795, + "kl": 0.0289306640625, + "learning_rate": 1.0001245197091917e-07, + "loss": 0.0434, + "num_tokens": 2537850248.0, + "reward": 2.4871652126312256, + "reward_std": 0.3668181300163269, + "rewards/accuracy_reward/mean": 0.5491071343421936, + "rewards/accuracy_reward/std": 0.4981389045715332, + "rewards/format_reward/mean": 0.9575892686843872, + "rewards/format_reward/std": 0.20174959301948547, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.10854540765285492, + "step": 4684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1093.649658203125, + "completions/mean_terminated_length": 833.3721923828125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9983485163283788, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.11087445682988281, + "kl": 0.023651123046875, + "learning_rate": 1.0001008610528253e-07, + "loss": 0.0371, + "num_tokens": 2538411051.0, + "reward": 2.3443081378936768, + "reward_std": 0.4391659200191498, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.49835437536239624, + "rewards/format_reward/mean": 0.9196428656578064, + "rewards/format_reward/std": 0.2721492052078247, + "rewards/tag_count_reward/mean": 0.9715401530265808, + "rewards/tag_count_reward/std": 0.13646738231182098, + "step": 4685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 991.7053833007812, + "completions/mean_terminated_length": 792.7745361328125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9985616109956849, + "frac_reward_zero_std": 0.1071428656578064, + "grad_norm": 0.12353653238258114, + "kl": 0.026458740234375, + "learning_rate": 1.0000796927461941e-07, + "loss": 0.0295, + "num_tokens": 2538923063.0, + "reward": 2.5145089626312256, + "reward_std": 0.4036351144313812, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49168136715888977, + "rewards/format_reward/mean": 0.9419642686843872, + "rewards/format_reward/std": 0.23407234251499176, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.10627461224794388, + "step": 4686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 993.0022583007812, + "completions/mean_terminated_length": 724.0812377929688, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9987747056629908, + "frac_reward_zero_std": 0.0714285746216774, + "grad_norm": 0.12458068262714309, + "kl": 0.03131103515625, + "learning_rate": 1.0000610148010136e-07, + "loss": 0.0352, + "num_tokens": 2539431800.0, + "reward": 2.4190850257873535, + "reward_std": 0.37573495507240295, + "rewards/accuracy_reward/mean": 0.5111607313156128, + "rewards/accuracy_reward/std": 0.5004342198371887, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.2463276982307434, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.12037914246320724, + "step": 4687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 890.85498046875, + "completions/mean_terminated_length": 738.9065551757812, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9989878003302968, + "frac_reward_zero_std": 0.2857142984867096, + "grad_norm": 0.11769463533609018, + "kl": 0.02886962890625, + "learning_rate": 1.0000448272276205e-07, + "loss": 0.0468, + "num_tokens": 2539904663.0, + "reward": 2.5044643878936768, + "reward_std": 0.35532113909721375, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4944108724594116, + "rewards/format_reward/mean": 0.9441964030265808, + "rewards/format_reward/std": 0.22979861497879028, + "rewards/tag_count_reward/mean": 0.9821428656578064, + "rewards/tag_count_reward/std": 0.10818972438573837, + "step": 4688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 946.6317138671875, + "completions/mean_terminated_length": 769.7279663085938, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.9992008949976027, + "frac_reward_zero_std": 0.2142857313156128, + "grad_norm": 0.11571966958610877, + "kl": 0.026031494140625, + "learning_rate": 1.0000311300349733e-07, + "loss": 0.0535, + "num_tokens": 2540404066.0, + "reward": 2.4933037757873535, + "reward_std": 0.350578248500824, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49663296341896057, + "rewards/format_reward/mean": 0.9397321343421936, + "rewards/format_reward/std": 0.23824848234653473, + "rewards/tag_count_reward/mean": 0.9910714030265808, + "rewards/tag_count_reward/std": 0.07038223743438721, + "step": 4689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1772.0, + "completions/mean_length": 806.700927734375, + "completions/mean_terminated_length": 685.0049438476562, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9994139896649087, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14513044595683983, + "kl": 0.030975341796875, + "learning_rate": 1.0000199232306522e-07, + "loss": 0.1109, + "num_tokens": 2540838044.0, + "reward": 2.4854912757873535, + "reward_std": 0.4160224497318268, + "rewards/accuracy_reward/mean": 0.5714285969734192, + "rewards/accuracy_reward/std": 0.49542486667633057, + "rewards/format_reward/mean": 0.9352678656578064, + "rewards/format_reward/std": 0.24632768332958221, + "rewards/tag_count_reward/mean": 0.9787946343421936, + "rewards/tag_count_reward/std": 0.11389531940221786, + "step": 4690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1119.8013916015625, + "completions/mean_terminated_length": 866.65625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9996270843322146, + "frac_reward_zero_std": 0.0357142873108387, + "grad_norm": 0.1119688690457679, + "kl": 0.0234375, + "learning_rate": 1.0000112068208598e-07, + "loss": 0.0653, + "num_tokens": 2541412995.0, + "reward": 2.3487725257873535, + "reward_std": 0.4741098880767822, + "rewards/accuracy_reward/mean": 0.4553571343421936, + "rewards/accuracy_reward/std": 0.49855977296829224, + "rewards/format_reward/mean": 0.9285714030265808, + "rewards/format_reward/std": 0.2578272819519043, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.15056781470775604, + "step": 4691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 967.9464721679688, + "completions/mean_terminated_length": 791.2103881835938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9998401789995205, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1208217601468528, + "kl": 0.024871826171875, + "learning_rate": 1.0000049808104197e-07, + "loss": 0.082, + "num_tokens": 2541918203.0, + "reward": 2.5044643878936768, + "reward_std": 0.33556851744651794, + "rewards/accuracy_reward/mean": 0.5736607313156128, + "rewards/accuracy_reward/std": 0.49509719014167786, + "rewards/format_reward/mean": 0.9598214030265808, + "rewards/format_reward/std": 0.1965973675251007, + "rewards/tag_count_reward/mean": 0.9709821343421936, + "rewards/tag_count_reward/std": 0.14088858664035797, + "step": 4692 + }, + { + "epoch": 0.9998401789995205, + "step": 4692, + "total_flos": 0.0, + "train_loss": 0.08478201504697423, + "train_runtime": 144692.0244, + "train_samples_per_second": 0.908, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1, + "max_steps": 4693, + "num_input_tokens_seen": 2541918203, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}