Invalid JSON:
Unexpected token 'I', ..."ad_norm": Infinity,
"... is not valid JSON
| { | |
| "best_global_step": 4491, | |
| "best_metric": 0.41118884086608887, | |
| "best_model_checkpoint": "models/grpo_toxic_qwen/checkpoint-4491", | |
| "epoch": 0.9996661101836394, | |
| "eval_steps": 2696, | |
| "global_step": 4491, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.890625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 106.0, | |
| "completions/mean_length": 119.59375, | |
| "completions/mean_terminated_length": 51.142860412597656, | |
| "completions/min_length": 15.0, | |
| "completions/min_terminated_length": 15.0, | |
| "epoch": 0.00022259321090706732, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.030076026916504, | |
| "kl": 3.605344500101637e-05, | |
| "learning_rate": 0.0, | |
| "loss": -0.0286, | |
| "num_tokens": 9462.0, | |
| "reward": -6.696479797363281, | |
| "reward_std": 2.205897808074951, | |
| "rewards/RewardModelWrapper/mean": -6.696479797363281, | |
| "rewards/RewardModelWrapper/std": 2.596616506576538, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00045590819666228654, | |
| "clip_ratio/high_mean": 0.00045590819666228654, | |
| "clip_ratio/low_mean": 9.893491918848333e-05, | |
| "clip_ratio/low_min": 9.893491918848333e-05, | |
| "clip_ratio/region_mean": 0.0005548431188205485, | |
| "completions/clipped_ratio": 0.91015625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 115.375, | |
| "completions/mean_length": 124.541015625, | |
| "completions/mean_terminated_length": 88.15992164611816, | |
| "completions/min_length": 53.8125, | |
| "completions/min_terminated_length": 53.8125, | |
| "epoch": 0.011129660545353366, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.771069526672363, | |
| "kl": 0.0014390296781742536, | |
| "learning_rate": 7.350000000000001e-07, | |
| "loss": -0.0097, | |
| "num_tokens": 164224.0, | |
| "reward": -6.273432105779648, | |
| "reward_std": 2.3787402510643005, | |
| "rewards/RewardModelWrapper/mean": -6.273432105779648, | |
| "rewards/RewardModelWrapper/std": 3.4789108261466026, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0075913356387172825, | |
| "clip_ratio/high_mean": 0.0075913356387172825, | |
| "clip_ratio/low_mean": 0.003807623453612905, | |
| "clip_ratio/low_min": 0.003807623453612905, | |
| "clip_ratio/region_mean": 0.011398959086218383, | |
| "completions/clipped_ratio": 0.8915441176470589, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 110.17647058823529, | |
| "completions/mean_length": 123.2251838235294, | |
| "completions/mean_terminated_length": 81.49435559441062, | |
| "completions/min_length": 44.470588235294116, | |
| "completions/min_terminated_length": 44.470588235294116, | |
| "epoch": 0.022259321090706732, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.885649681091309, | |
| "kl": 0.019224860495887695, | |
| "learning_rate": 1.485e-06, | |
| "loss": -0.0113, | |
| "num_tokens": 327613.0, | |
| "reward": -5.39674503663007, | |
| "reward_std": 2.7843008882859173, | |
| "rewards/RewardModelWrapper/mean": -5.39674503663007, | |
| "rewards/RewardModelWrapper/std": 3.8948283475988053, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01675744824227877, | |
| "clip_ratio/high_mean": 0.01675744824227877, | |
| "clip_ratio/low_mean": 0.012073511610215065, | |
| "clip_ratio/low_min": 0.012073511610215065, | |
| "clip_ratio/region_mean": 0.028830959817860276, | |
| "completions/clipped_ratio": 0.9091796875, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 107.3125, | |
| "completions/mean_length": 124.3720703125, | |
| "completions/mean_terminated_length": 81.7018609046936, | |
| "completions/min_length": 54.6875, | |
| "completions/min_terminated_length": 46.6875, | |
| "epoch": 0.0333889816360601, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.0026164054870605, | |
| "kl": 0.04671986572444439, | |
| "learning_rate": 2.235e-06, | |
| "loss": 0.0052, | |
| "num_tokens": 482354.0, | |
| "reward": -5.547765076160431, | |
| "reward_std": 2.73693485558033, | |
| "rewards/RewardModelWrapper/mean": -5.547765076160431, | |
| "rewards/RewardModelWrapper/std": 3.441145323216915, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02414312065928243, | |
| "clip_ratio/high_mean": 0.02414312065928243, | |
| "clip_ratio/low_mean": 0.017463966414215975, | |
| "clip_ratio/low_min": 0.017463966414215975, | |
| "clip_ratio/region_mean": 0.04160708721727133, | |
| "completions/clipped_ratio": 0.9200367647058824, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 119.6470588235294, | |
| "completions/mean_length": 125.29503676470588, | |
| "completions/mean_terminated_length": 94.28872680664062, | |
| "completions/min_length": 65.58823529411765, | |
| "completions/min_terminated_length": 65.58823529411765, | |
| "epoch": 0.044518642181413465, | |
| "frac_reward_zero_std": 0.007352941176470588, | |
| "grad_norm": 3.9181442260742188, | |
| "kl": 0.0877579689398408, | |
| "learning_rate": 2.97e-06, | |
| "loss": 0.0105, | |
| "num_tokens": 648123.0, | |
| "reward": -4.304881698944989, | |
| "reward_std": 3.38148234872257, | |
| "rewards/RewardModelWrapper/mean": -4.304881698944989, | |
| "rewards/RewardModelWrapper/std": 4.617817443959853, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029861916538793595, | |
| "clip_ratio/high_mean": 0.029861916538793595, | |
| "clip_ratio/low_mean": 0.023766413825796917, | |
| "clip_ratio/low_min": 0.023766413825796917, | |
| "clip_ratio/region_mean": 0.05362833026330918, | |
| "completions/clipped_ratio": 0.9172794117647058, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 116.0, | |
| "completions/mean_length": 124.69761029411765, | |
| "completions/mean_terminated_length": 89.32857289033778, | |
| "completions/min_length": 53.23529411764706, | |
| "completions/min_terminated_length": 53.23529411764706, | |
| "epoch": 0.05564830272676683, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.056563377380371, | |
| "kl": 0.14030909642577172, | |
| "learning_rate": 2.9836102890962895e-06, | |
| "loss": 0.0228, | |
| "num_tokens": 813050.0, | |
| "reward": -4.152413817013011, | |
| "reward_std": 3.3082274689393887, | |
| "rewards/RewardModelWrapper/mean": -4.152413817013011, | |
| "rewards/RewardModelWrapper/std": 4.367143616956823, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.030323101801332086, | |
| "clip_ratio/high_mean": 0.030323101801332086, | |
| "clip_ratio/low_mean": 0.021581946768565105, | |
| "clip_ratio/low_min": 0.021581946768565105, | |
| "clip_ratio/region_mean": 0.051905048433691266, | |
| "completions/clipped_ratio": 0.9248046875, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 116.1875, | |
| "completions/mean_length": 125.275390625, | |
| "completions/mean_terminated_length": 89.46597385406494, | |
| "completions/min_length": 56.5625, | |
| "completions/min_terminated_length": 56.5625, | |
| "epoch": 0.0667779632721202, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 2.983767032623291, | |
| "kl": 0.1569075232744217, | |
| "learning_rate": 2.966537673571591e-06, | |
| "loss": 0.0317, | |
| "num_tokens": 969156.0, | |
| "reward": -3.388260453939438, | |
| "reward_std": 3.2063718885183334, | |
| "rewards/RewardModelWrapper/mean": -3.388260453939438, | |
| "rewards/RewardModelWrapper/std": 4.789341554045677, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.027688504084944724, | |
| "clip_ratio/high_mean": 0.027688504084944724, | |
| "clip_ratio/low_mean": 0.019530020136153327, | |
| "clip_ratio/low_min": 0.019530020136153327, | |
| "clip_ratio/region_mean": 0.04721852412912995, | |
| "completions/clipped_ratio": 0.9191176470588235, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 120.05882352941177, | |
| "completions/mean_length": 124.81709558823529, | |
| "completions/mean_terminated_length": 90.29201911477482, | |
| "completions/min_length": 52.64705882352941, | |
| "completions/min_terminated_length": 52.64705882352941, | |
| "epoch": 0.07790762381747357, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.5877625942230225, | |
| "kl": 0.1643798241391778, | |
| "learning_rate": 2.9494650580468926e-06, | |
| "loss": 0.0293, | |
| "num_tokens": 1134229.0, | |
| "reward": -3.141713114345775, | |
| "reward_std": 3.3975463895236744, | |
| "rewards/RewardModelWrapper/mean": -3.141713114345775, | |
| "rewards/RewardModelWrapper/std": 4.820348431082333, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.028169492546003313, | |
| "clip_ratio/high_mean": 0.028169492546003313, | |
| "clip_ratio/low_mean": 0.019790295051643626, | |
| "clip_ratio/low_min": 0.019790295051643626, | |
| "clip_ratio/region_mean": 0.04795978774316609, | |
| "completions/clipped_ratio": 0.9310661764705882, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 108.11764705882354, | |
| "completions/mean_length": 125.64889705882354, | |
| "completions/mean_terminated_length": 87.01379753561581, | |
| "completions/min_length": 62.1764705882353, | |
| "completions/min_terminated_length": 54.64705882352941, | |
| "epoch": 0.08903728436282693, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.695188999176025, | |
| "kl": 0.30343326754868033, | |
| "learning_rate": 2.933416799453676e-06, | |
| "loss": 0.0748, | |
| "num_tokens": 1300167.0, | |
| "reward": -3.474738233229693, | |
| "reward_std": 3.482299538219676, | |
| "rewards/RewardModelWrapper/mean": -3.474738233229693, | |
| "rewards/RewardModelWrapper/std": 4.745730189716115, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.029925933612976224, | |
| "clip_ratio/high_mean": 0.029925933612976224, | |
| "clip_ratio/low_mean": 0.019293442433699966, | |
| "clip_ratio/low_min": 0.019293442433699966, | |
| "clip_ratio/region_mean": 0.04921937589067966, | |
| "completions/clipped_ratio": 0.943359375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 110.9375, | |
| "completions/mean_length": 126.4140625, | |
| "completions/mean_terminated_length": 95.56250047683716, | |
| "completions/min_length": 79.5, | |
| "completions/min_terminated_length": 71.5, | |
| "epoch": 0.1001669449081803, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.636467218399048, | |
| "kl": 0.19514246992766857, | |
| "learning_rate": 2.9163441839289777e-06, | |
| "loss": 0.0415, | |
| "num_tokens": 1457135.0, | |
| "reward": -3.0616072714328766, | |
| "reward_std": 3.3436961472034454, | |
| "rewards/RewardModelWrapper/mean": -3.0616072714328766, | |
| "rewards/RewardModelWrapper/std": 4.945626050233841, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.027343249125406147, | |
| "clip_ratio/high_mean": 0.027343249125406147, | |
| "clip_ratio/low_mean": 0.01768903057440184, | |
| "clip_ratio/low_min": 0.01768903057440184, | |
| "clip_ratio/region_mean": 0.04503227963577956, | |
| "completions/clipped_ratio": 0.9393382352941176, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 109.47058823529412, | |
| "completions/mean_length": 126.07444852941177, | |
| "completions/mean_terminated_length": 92.27339037726907, | |
| "completions/min_length": 73.88235294117646, | |
| "completions/min_terminated_length": 66.3529411764706, | |
| "epoch": 0.11129660545353366, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.624467134475708, | |
| "kl": 0.19471221148967743, | |
| "learning_rate": 2.8992715684042796e-06, | |
| "loss": 0.0459, | |
| "num_tokens": 1623608.0, | |
| "reward": -3.0403577299679028, | |
| "reward_std": 3.5023320422453037, | |
| "rewards/RewardModelWrapper/mean": -3.0403577299679028, | |
| "rewards/RewardModelWrapper/std": 4.758344790514777, | |
| "step": 500 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.026099461197154596, | |
| "clip_ratio/high_mean": 0.026099461197154596, | |
| "clip_ratio/low_mean": 0.01860616845311597, | |
| "clip_ratio/low_min": 0.01860616845311597, | |
| "clip_ratio/region_mean": 0.04470562972594053, | |
| "completions/clipped_ratio": 0.9209558823529411, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 111.3529411764706, | |
| "completions/mean_length": 125.32536764705883, | |
| "completions/mean_terminated_length": 86.94334905287799, | |
| "completions/min_length": 54.64705882352941, | |
| "completions/min_terminated_length": 47.11764705882353, | |
| "epoch": 0.12242626599888703, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.117679119110107, | |
| "kl": 0.1926309671998024, | |
| "learning_rate": 2.882198952879581e-06, | |
| "loss": 0.0424, | |
| "num_tokens": 1789042.0, | |
| "reward": -3.364777831470265, | |
| "reward_std": 3.6073132402756634, | |
| "rewards/RewardModelWrapper/mean": -3.364777831470265, | |
| "rewards/RewardModelWrapper/std": 4.984979461221134, | |
| "step": 550 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.027654693657532335, | |
| "clip_ratio/high_mean": 0.027654693657532335, | |
| "clip_ratio/low_mean": 0.01964853117824532, | |
| "clip_ratio/low_min": 0.01964853117824532, | |
| "clip_ratio/region_mean": 0.047303224778734144, | |
| "completions/clipped_ratio": 0.8984375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 124.5, | |
| "completions/mean_length": 124.734375, | |
| "completions/mean_terminated_length": 98.57239484786987, | |
| "completions/min_length": 61.25, | |
| "completions/min_terminated_length": 61.25, | |
| "epoch": 0.1335559265442404, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.241142511367798, | |
| "kl": 0.211693402081728, | |
| "learning_rate": 2.865126337354883e-06, | |
| "loss": 0.0498, | |
| "num_tokens": 1944610.0, | |
| "reward": -2.732655808329582, | |
| "reward_std": 3.617541193962097, | |
| "rewards/RewardModelWrapper/mean": -2.732655808329582, | |
| "rewards/RewardModelWrapper/std": 4.809614151716232, | |
| "step": 600 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.027527469391934574, | |
| "clip_ratio/high_mean": 0.027527469391934574, | |
| "clip_ratio/low_mean": 0.019259323065634815, | |
| "clip_ratio/low_min": 0.019259323065634815, | |
| "clip_ratio/region_mean": 0.046786792553029956, | |
| "completions/clipped_ratio": 0.8933823529411765, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 121.47058823529412, | |
| "completions/mean_length": 124.16727941176471, | |
| "completions/mean_terminated_length": 96.0017848295324, | |
| "completions/min_length": 59.23529411764706, | |
| "completions/min_terminated_length": 59.23529411764706, | |
| "epoch": 0.14468558708959378, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.524644374847412, | |
| "kl": 0.2376550894230604, | |
| "learning_rate": 2.8480537218301847e-06, | |
| "loss": 0.0528, | |
| "num_tokens": 2109128.0, | |
| "reward": -1.8917093557469986, | |
| "reward_std": 3.8112815267899456, | |
| "rewards/RewardModelWrapper/mean": -1.8917093557469986, | |
| "rewards/RewardModelWrapper/std": 5.167453260982738, | |
| "step": 650 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.027425415357574822, | |
| "clip_ratio/high_mean": 0.027425415357574822, | |
| "clip_ratio/low_mean": 0.01982414353871718, | |
| "clip_ratio/low_min": 0.01982414353871718, | |
| "clip_ratio/region_mean": 0.04724955870769918, | |
| "completions/clipped_ratio": 0.8602941176470589, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 121.05882352941177, | |
| "completions/mean_length": 123.19117647058823, | |
| "completions/mean_terminated_length": 94.26595889820771, | |
| "completions/min_length": 52.0, | |
| "completions/min_terminated_length": 52.0, | |
| "epoch": 0.15581524763494714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.250056743621826, | |
| "kl": 0.22009442321956157, | |
| "learning_rate": 2.830981106305486e-06, | |
| "loss": 0.044, | |
| "num_tokens": 2272320.0, | |
| "reward": -2.427665850695442, | |
| "reward_std": 3.78492192661061, | |
| "rewards/RewardModelWrapper/mean": -2.427665850695442, | |
| "rewards/RewardModelWrapper/std": 4.859750719631419, | |
| "step": 700 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02454757507191971, | |
| "clip_ratio/high_mean": 0.02454757507191971, | |
| "clip_ratio/low_mean": 0.0160788345040055, | |
| "clip_ratio/low_min": 0.0160788345040055, | |
| "clip_ratio/region_mean": 0.04062640947755426, | |
| "completions/clipped_ratio": 0.8837890625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 117.25, | |
| "completions/mean_length": 123.8193359375, | |
| "completions/mean_terminated_length": 92.61992502212524, | |
| "completions/min_length": 56.125, | |
| "completions/min_terminated_length": 56.125, | |
| "epoch": 0.1669449081803005, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.143310070037842, | |
| "kl": 0.2187542901188135, | |
| "learning_rate": 2.8139084907807877e-06, | |
| "loss": 0.0458, | |
| "num_tokens": 2426567.0, | |
| "reward": -2.639084130525589, | |
| "reward_std": 4.0981148183345795, | |
| "rewards/RewardModelWrapper/mean": -2.639084130525589, | |
| "rewards/RewardModelWrapper/std": 5.267414927482605, | |
| "step": 750 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023827595426701008, | |
| "clip_ratio/high_mean": 0.023827595426701008, | |
| "clip_ratio/low_mean": 0.01665229408070445, | |
| "clip_ratio/low_min": 0.01665229408070445, | |
| "clip_ratio/region_mean": 0.04047988944686949, | |
| "completions/clipped_ratio": 0.9172794117647058, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 117.41176470588235, | |
| "completions/mean_length": 124.52849264705883, | |
| "completions/mean_terminated_length": 86.05495004092946, | |
| "completions/min_length": 53.294117647058826, | |
| "completions/min_terminated_length": 53.294117647058826, | |
| "epoch": 0.17807456872565386, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.82859206199646, | |
| "kl": 0.2341267079859972, | |
| "learning_rate": 2.7968358752560893e-06, | |
| "loss": 0.0495, | |
| "num_tokens": 2591422.0, | |
| "reward": -1.696559471242568, | |
| "reward_std": 4.100044530980727, | |
| "rewards/RewardModelWrapper/mean": -1.696559471242568, | |
| "rewards/RewardModelWrapper/std": 5.4215626155628875, | |
| "step": 800 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025062179565429686, | |
| "clip_ratio/high_mean": 0.025062179565429686, | |
| "clip_ratio/low_mean": 0.018277215642156078, | |
| "clip_ratio/low_min": 0.018277215642156078, | |
| "clip_ratio/region_mean": 0.04333939506206661, | |
| "completions/clipped_ratio": 0.9292279411764706, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 113.58823529411765, | |
| "completions/mean_length": 125.45588235294117, | |
| "completions/mean_terminated_length": 91.19166834214154, | |
| "completions/min_length": 58.94117647058823, | |
| "completions/min_terminated_length": 58.94117647058823, | |
| "epoch": 0.18920422927100725, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.91913366317749, | |
| "kl": 0.22518106378614902, | |
| "learning_rate": 2.779763259731391e-06, | |
| "loss": 0.0531, | |
| "num_tokens": 2757254.0, | |
| "reward": -0.3187214837354772, | |
| "reward_std": 5.127424436457017, | |
| "rewards/RewardModelWrapper/mean": -0.3187214837354772, | |
| "rewards/RewardModelWrapper/std": 5.87655990263995, | |
| "step": 850 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02293195443926379, | |
| "clip_ratio/high_mean": 0.02293195443926379, | |
| "clip_ratio/low_mean": 0.017691890239948407, | |
| "clip_ratio/low_min": 0.017691890239948407, | |
| "clip_ratio/region_mean": 0.040623844610527156, | |
| "completions/clipped_ratio": 0.9091796875, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 119.875, | |
| "completions/mean_length": 124.306640625, | |
| "completions/mean_terminated_length": 88.5287561416626, | |
| "completions/min_length": 47.5, | |
| "completions/min_terminated_length": 47.5, | |
| "epoch": 0.2003338898163606, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 2.8705227375030518, | |
| "kl": 0.23920009069144726, | |
| "learning_rate": 2.7626906442066923e-06, | |
| "loss": 0.0608, | |
| "num_tokens": 2912304.0, | |
| "reward": -0.5163702219724655, | |
| "reward_std": 5.298731863498688, | |
| "rewards/RewardModelWrapper/mean": -0.5163702219724655, | |
| "rewards/RewardModelWrapper/std": 5.84825000166893, | |
| "step": 900 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02397001946810633, | |
| "clip_ratio/high_mean": 0.02397001946810633, | |
| "clip_ratio/low_mean": 0.016966249566758053, | |
| "clip_ratio/low_min": 0.016966249566758053, | |
| "clip_ratio/region_mean": 0.040936269152443854, | |
| "completions/clipped_ratio": 0.9053308823529411, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 112.17647058823529, | |
| "completions/mean_length": 124.65533088235294, | |
| "completions/mean_terminated_length": 90.35452988568474, | |
| "completions/min_length": 57.64705882352941, | |
| "completions/min_terminated_length": 57.64705882352941, | |
| "epoch": 0.21146355036171396, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.009652137756348, | |
| "kl": 0.28723155200481415, | |
| "learning_rate": 2.7456180286819943e-06, | |
| "loss": 0.0623, | |
| "num_tokens": 3077033.0, | |
| "reward": 0.3360518918317907, | |
| "reward_std": 5.125342537375057, | |
| "rewards/RewardModelWrapper/mean": 0.3360518918317907, | |
| "rewards/RewardModelWrapper/std": 5.78782990399529, | |
| "step": 950 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025908510715235023, | |
| "clip_ratio/high_mean": 0.025908510715235023, | |
| "clip_ratio/low_mean": 0.017599179263343104, | |
| "clip_ratio/low_min": 0.017599179263343104, | |
| "clip_ratio/region_mean": 0.04350769010838121, | |
| "completions/clipped_ratio": 0.9172794117647058, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 121.41176470588235, | |
| "completions/mean_length": 125.03125, | |
| "completions/mean_terminated_length": 94.79173772475299, | |
| "completions/min_length": 61.88235294117647, | |
| "completions/min_terminated_length": 61.88235294117647, | |
| "epoch": 0.22259321090706732, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 7.159637928009033, | |
| "kl": 0.35069699488580225, | |
| "learning_rate": 2.728545413157296e-06, | |
| "loss": 0.0847, | |
| "num_tokens": 3242123.0, | |
| "reward": 1.627946559120627, | |
| "reward_std": 4.790118554059197, | |
| "rewards/RewardModelWrapper/mean": 1.627946559120627, | |
| "rewards/RewardModelWrapper/std": 5.393552022821763, | |
| "step": 1000 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.024083305108360945, | |
| "clip_ratio/high_mean": 0.024083305108360945, | |
| "clip_ratio/low_mean": 0.013416973181592766, | |
| "clip_ratio/low_min": 0.013416973181592766, | |
| "clip_ratio/region_mean": 0.0375002783536911, | |
| "completions/clipped_ratio": 0.9267578125, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 112.8125, | |
| "completions/mean_length": 125.0126953125, | |
| "completions/mean_terminated_length": 87.55602884292603, | |
| "completions/min_length": 57.0, | |
| "completions/min_terminated_length": 57.0, | |
| "epoch": 0.2337228714524207, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.147945880889893, | |
| "kl": 0.3470122530311346, | |
| "learning_rate": 2.7114727976325973e-06, | |
| "loss": 0.089, | |
| "num_tokens": 3397872.0, | |
| "reward": 0.24295206367969513, | |
| "reward_std": 5.033507749438286, | |
| "rewards/RewardModelWrapper/mean": 0.24295206367969513, | |
| "rewards/RewardModelWrapper/std": 5.808434098958969, | |
| "step": 1050 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.024642590049188583, | |
| "clip_ratio/high_mean": 0.024642590049188583, | |
| "clip_ratio/low_mean": 0.013819608901976609, | |
| "clip_ratio/low_min": 0.013819608901976609, | |
| "clip_ratio/region_mean": 0.0384621987817809, | |
| "completions/clipped_ratio": 0.9264705882352942, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 112.70588235294117, | |
| "completions/mean_length": 125.2408088235294, | |
| "completions/mean_terminated_length": 90.793908960679, | |
| "completions/min_length": 65.17647058823529, | |
| "completions/min_terminated_length": 65.17647058823529, | |
| "epoch": 0.24485253199777407, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.4217491149902344, | |
| "kl": 0.377669473439455, | |
| "learning_rate": 2.694400182107899e-06, | |
| "loss": 0.0977, | |
| "num_tokens": 3563110.0, | |
| "reward": 0.4081239700317383, | |
| "reward_std": 5.032071225783405, | |
| "rewards/RewardModelWrapper/mean": 0.4081239700317383, | |
| "rewards/RewardModelWrapper/std": 5.893623436198515, | |
| "step": 1100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.022081555526237934, | |
| "clip_ratio/high_mean": 0.022081555526237934, | |
| "clip_ratio/low_mean": 0.015956819643906783, | |
| "clip_ratio/low_min": 0.015956819643906783, | |
| "clip_ratio/region_mean": 0.038038374953903255, | |
| "completions/clipped_ratio": 0.9365808823529411, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 95.0, | |
| "completions/mean_length": 125.22426470588235, | |
| "completions/mean_terminated_length": 73.54575303021599, | |
| "completions/min_length": 63.94117647058823, | |
| "completions/min_terminated_length": 48.88235294117647, | |
| "epoch": 0.25598219254312743, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.748858451843262, | |
| "kl": 0.42920990511775015, | |
| "learning_rate": 2.677327566583201e-06, | |
| "loss": 0.1117, | |
| "num_tokens": 3728050.0, | |
| "reward": 1.730754810221055, | |
| "reward_std": 4.819248423856847, | |
| "rewards/RewardModelWrapper/mean": 1.730754810221055, | |
| "rewards/RewardModelWrapper/std": 5.504547006943646, | |
| "step": 1150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02080470887827687, | |
| "clip_ratio/high_mean": 0.02080470887827687, | |
| "clip_ratio/low_mean": 0.01475024281651713, | |
| "clip_ratio/low_min": 0.01475024281651713, | |
| "clip_ratio/region_mean": 0.03555495172040537, | |
| "completions/clipped_ratio": 0.962890625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 98.625, | |
| "completions/mean_length": 126.416015625, | |
| "completions/mean_terminated_length": 81.55208349227905, | |
| "completions/min_length": 72.375, | |
| "completions/min_terminated_length": 64.375, | |
| "epoch": 0.2671118530884808, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.4883859157562256, | |
| "kl": 0.42720063477754594, | |
| "learning_rate": 2.6602549510585024e-06, | |
| "loss": 0.1195, | |
| "num_tokens": 3884876.0, | |
| "reward": 1.9971511512994766, | |
| "reward_std": 4.786640420556068, | |
| "rewards/RewardModelWrapper/mean": 1.9971511512994766, | |
| "rewards/RewardModelWrapper/std": 5.766968697309494, | |
| "step": 1200 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023233274864032864, | |
| "clip_ratio/high_mean": 0.023233274864032864, | |
| "clip_ratio/low_mean": 0.01158983559376793, | |
| "clip_ratio/low_min": 0.01158983559376793, | |
| "clip_ratio/region_mean": 0.03482311038998887, | |
| "completions/clipped_ratio": 0.9347426470588235, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 114.17647058823529, | |
| "completions/mean_length": 125.17738970588235, | |
| "completions/mean_terminated_length": 89.91648954503677, | |
| "completions/min_length": 64.29411764705883, | |
| "completions/min_terminated_length": 64.29411764705883, | |
| "epoch": 0.27824151363383415, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.5916192531585693, | |
| "kl": 0.3887044958770275, | |
| "learning_rate": 2.643182335533804e-06, | |
| "loss": 0.1015, | |
| "num_tokens": 4050013.0, | |
| "reward": 0.8245974989498362, | |
| "reward_std": 5.001701130586512, | |
| "rewards/RewardModelWrapper/mean": 0.8245974989498362, | |
| "rewards/RewardModelWrapper/std": 5.80830400130328, | |
| "step": 1250 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020529154643882067, | |
| "clip_ratio/high_mean": 0.020529154643882067, | |
| "clip_ratio/low_mean": 0.015356352158414665, | |
| "clip_ratio/low_min": 0.015356352158414665, | |
| "clip_ratio/region_mean": 0.03588550680316985, | |
| "completions/clipped_ratio": 0.9512867647058824, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 100.76470588235294, | |
| "completions/mean_length": 125.86305147058823, | |
| "completions/mean_terminated_length": 81.50539353314568, | |
| "completions/min_length": 63.470588235294116, | |
| "completions/min_terminated_length": 55.94117647058823, | |
| "epoch": 0.28937117417918756, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.05077600479126, | |
| "kl": 0.4404847612977028, | |
| "learning_rate": 2.6261097200091054e-06, | |
| "loss": 0.1208, | |
| "num_tokens": 4215832.0, | |
| "reward": 2.3118093013763428, | |
| "reward_std": 4.841920866685755, | |
| "rewards/RewardModelWrapper/mean": 2.3118093013763428, | |
| "rewards/RewardModelWrapper/std": 5.525171279907227, | |
| "step": 1300 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.024388792894314976, | |
| "clip_ratio/high_mean": 0.024388792894314976, | |
| "clip_ratio/low_mean": 0.015166401157330256, | |
| "clip_ratio/low_min": 0.015166401157330256, | |
| "clip_ratio/region_mean": 0.039555194084532556, | |
| "completions/clipped_ratio": 0.9345703125, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 111.5, | |
| "completions/mean_length": 125.7822265625, | |
| "completions/mean_terminated_length": 88.69479322433472, | |
| "completions/min_length": 64.0625, | |
| "completions/min_terminated_length": 56.0625, | |
| "epoch": 0.3005008347245409, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.280036926269531, | |
| "kl": 0.4519739609956741, | |
| "learning_rate": 2.609037104484407e-06, | |
| "loss": 0.1237, | |
| "num_tokens": 4372361.0, | |
| "reward": 2.7925052791833878, | |
| "reward_std": 4.665284767746925, | |
| "rewards/RewardModelWrapper/mean": 2.7925052791833878, | |
| "rewards/RewardModelWrapper/std": 5.4118489027023315, | |
| "step": 1350 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023544567436911166, | |
| "clip_ratio/high_mean": 0.023544567436911166, | |
| "clip_ratio/low_mean": 0.01299051069712732, | |
| "clip_ratio/low_min": 0.01299051069712732, | |
| "clip_ratio/region_mean": 0.03653507822658866, | |
| "completions/clipped_ratio": 0.9292279411764706, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 111.82352941176471, | |
| "completions/mean_length": 124.65165441176471, | |
| "completions/mean_terminated_length": 82.68410469503964, | |
| "completions/min_length": 46.8235294117647, | |
| "completions/min_terminated_length": 46.8235294117647, | |
| "epoch": 0.3116304952698943, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.7401034832000732, | |
| "kl": 0.4922306627035141, | |
| "learning_rate": 2.591964488959709e-06, | |
| "loss": 0.1315, | |
| "num_tokens": 4537846.0, | |
| "reward": 2.862899471731747, | |
| "reward_std": 4.948802695554845, | |
| "rewards/RewardModelWrapper/mean": 2.862899471731747, | |
| "rewards/RewardModelWrapper/std": 5.503605421851663, | |
| "step": 1400 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025161673842230812, | |
| "clip_ratio/high_mean": 0.025161673842230812, | |
| "clip_ratio/low_mean": 0.012126781771657989, | |
| "clip_ratio/low_min": 0.012126781771657989, | |
| "clip_ratio/region_mean": 0.037288455746602264, | |
| "completions/clipped_ratio": 0.9292279411764706, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 117.17647058823529, | |
| "completions/mean_length": 125.05882352941177, | |
| "completions/mean_terminated_length": 90.7926357493681, | |
| "completions/min_length": 62.11764705882353, | |
| "completions/min_terminated_length": 62.11764705882353, | |
| "epoch": 0.32276015581524764, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.467922687530518, | |
| "kl": 0.46787314653396606, | |
| "learning_rate": 2.5748918734350105e-06, | |
| "loss": 0.1207, | |
| "num_tokens": 4702966.0, | |
| "reward": 1.3220273045932545, | |
| "reward_std": 4.946936158572926, | |
| "rewards/RewardModelWrapper/mean": 1.3220273045932545, | |
| "rewards/RewardModelWrapper/std": 5.816557715920841, | |
| "step": 1450 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025372368972748516, | |
| "clip_ratio/high_mean": 0.025372368972748516, | |
| "clip_ratio/low_mean": 0.01208616121119121, | |
| "clip_ratio/low_min": 0.01208616121119121, | |
| "clip_ratio/region_mean": 0.03745853026397526, | |
| "completions/clipped_ratio": 0.9599609375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 88.5625, | |
| "completions/mean_length": 126.3486328125, | |
| "completions/mean_terminated_length": 74.44687557220459, | |
| "completions/min_length": 73.875, | |
| "completions/min_terminated_length": 57.875, | |
| "epoch": 0.333889816360601, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.3412301540374756, | |
| "kl": 0.5128468088805676, | |
| "learning_rate": 2.557819257910312e-06, | |
| "loss": 0.1397, | |
| "num_tokens": 4860219.0, | |
| "reward": 2.0575065165758133, | |
| "reward_std": 5.155221775174141, | |
| "rewards/RewardModelWrapper/mean": 2.0575065165758133, | |
| "rewards/RewardModelWrapper/std": 5.655524164438248, | |
| "step": 1500 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023876634621992708, | |
| "clip_ratio/high_mean": 0.023876634621992708, | |
| "clip_ratio/low_mean": 0.013062482952955179, | |
| "clip_ratio/low_min": 0.013062482952955179, | |
| "clip_ratio/region_mean": 0.03693911746609956, | |
| "completions/clipped_ratio": 0.9604779411764706, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 84.47058823529412, | |
| "completions/mean_length": 126.6001838235294, | |
| "completions/mean_terminated_length": 73.35098131965188, | |
| "completions/min_length": 92.58823529411765, | |
| "completions/min_terminated_length": 62.470588235294116, | |
| "epoch": 0.34501947690595436, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.4934329986572266, | |
| "kl": 0.5125479310750961, | |
| "learning_rate": 2.5407466423856135e-06, | |
| "loss": 0.139, | |
| "num_tokens": 5027488.0, | |
| "reward": 2.7399597448461197, | |
| "reward_std": 4.745976616354549, | |
| "rewards/RewardModelWrapper/mean": 2.7399597448461197, | |
| "rewards/RewardModelWrapper/std": 5.300730144276338, | |
| "step": 1550 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02330939914332703, | |
| "clip_ratio/high_mean": 0.02330939914332703, | |
| "clip_ratio/low_mean": 0.009550860303861555, | |
| "clip_ratio/low_min": 0.009550860303861555, | |
| "clip_ratio/region_mean": 0.032860259409062564, | |
| "completions/clipped_ratio": 0.9641544117647058, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 95.70588235294117, | |
| "completions/mean_length": 126.61397058823529, | |
| "completions/mean_terminated_length": 81.39460844152114, | |
| "completions/min_length": 81.88235294117646, | |
| "completions/min_terminated_length": 66.82352941176471, | |
| "epoch": 0.3561491374513077, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.760233402252197, | |
| "kl": 0.5104251652956009, | |
| "learning_rate": 2.523674026860915e-06, | |
| "loss": 0.1396, | |
| "num_tokens": 5193996.0, | |
| "reward": 1.9389969741596895, | |
| "reward_std": 5.14070810991175, | |
| "rewards/RewardModelWrapper/mean": 1.9389969741596895, | |
| "rewards/RewardModelWrapper/std": 5.778058921589571, | |
| "step": 1600 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023669966620218474, | |
| "clip_ratio/high_mean": 0.023669966620218474, | |
| "clip_ratio/low_mean": 0.012192065346171147, | |
| "clip_ratio/low_min": 0.012192065346171147, | |
| "clip_ratio/region_mean": 0.035862031998112796, | |
| "completions/clipped_ratio": 0.9599609375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 105.625, | |
| "completions/mean_length": 126.4326171875, | |
| "completions/mean_terminated_length": 91.0947916507721, | |
| "completions/min_length": 77.5, | |
| "completions/min_terminated_length": 77.5, | |
| "epoch": 0.3672787979966611, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.496079206466675, | |
| "kl": 0.5115452679991722, | |
| "learning_rate": 2.5066014113362166e-06, | |
| "loss": 0.1421, | |
| "num_tokens": 5350823.0, | |
| "reward": 2.4139109551906586, | |
| "reward_std": 4.767535001039505, | |
| "rewards/RewardModelWrapper/mean": 2.4139109551906586, | |
| "rewards/RewardModelWrapper/std": 5.584080070257187, | |
| "step": 1650 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.026008948455564677, | |
| "clip_ratio/high_mean": 0.026008948455564677, | |
| "clip_ratio/low_mean": 0.008926556244841777, | |
| "clip_ratio/low_min": 0.008926556244841777, | |
| "clip_ratio/region_mean": 0.0349355046171695, | |
| "completions/clipped_ratio": 0.9604779411764706, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 105.94117647058823, | |
| "completions/mean_length": 126.34926470588235, | |
| "completions/mean_terminated_length": 89.24902052037856, | |
| "completions/min_length": 71.0, | |
| "completions/min_terminated_length": 71.0, | |
| "epoch": 0.3784084585420145, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.9608895778656006, | |
| "kl": 0.5186072036623954, | |
| "learning_rate": 2.489528795811518e-06, | |
| "loss": 0.1427, | |
| "num_tokens": 5517627.0, | |
| "reward": 1.1372435163049137, | |
| "reward_std": 5.190363294938031, | |
| "rewards/RewardModelWrapper/mean": 1.1372435163049137, | |
| "rewards/RewardModelWrapper/std": 5.777948155122645, | |
| "step": 1700 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.022060031631262973, | |
| "clip_ratio/high_mean": 0.022060031631262973, | |
| "clip_ratio/low_mean": 0.012312272182898596, | |
| "clip_ratio/low_min": 0.012312272182898596, | |
| "clip_ratio/region_mean": 0.03437230377923697, | |
| "completions/clipped_ratio": 0.9632352941176471, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 102.82352941176471, | |
| "completions/mean_length": 126.49540441176471, | |
| "completions/mean_terminated_length": 88.83088302612305, | |
| "completions/min_length": 76.3529411764706, | |
| "completions/min_terminated_length": 68.82352941176471, | |
| "epoch": 0.38953811908736785, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.0580315589904785, | |
| "kl": 0.5839428542554379, | |
| "learning_rate": 2.4724561802868197e-06, | |
| "loss": 0.1614, | |
| "num_tokens": 5684102.0, | |
| "reward": 2.689769050654243, | |
| "reward_std": 4.625988932216869, | |
| "rewards/RewardModelWrapper/mean": 2.689769050654243, | |
| "rewards/RewardModelWrapper/std": 5.245194827809053, | |
| "step": 1750 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02739144684630446, | |
| "clip_ratio/high_mean": 0.02739144684630446, | |
| "clip_ratio/low_mean": 0.012341015862475616, | |
| "clip_ratio/low_min": 0.012341015862475616, | |
| "clip_ratio/region_mean": 0.03973246271605604, | |
| "completions/clipped_ratio": 0.95703125, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 103.5625, | |
| "completions/mean_length": 126.060546875, | |
| "completions/mean_terminated_length": 83.71354246139526, | |
| "completions/min_length": 61.3125, | |
| "completions/min_terminated_length": 61.3125, | |
| "epoch": 0.4006677796327212, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 2.872232437133789, | |
| "kl": 0.5619081328809261, | |
| "learning_rate": 2.4553835647621216e-06, | |
| "loss": 0.1509, | |
| "num_tokens": 5840596.0, | |
| "reward": 2.3125159442424774, | |
| "reward_std": 4.9210382997989655, | |
| "rewards/RewardModelWrapper/mean": 2.3125159442424774, | |
| "rewards/RewardModelWrapper/std": 5.377374470233917, | |
| "step": 1800 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.024034175912383944, | |
| "clip_ratio/high_mean": 0.024034175912383944, | |
| "clip_ratio/low_mean": 0.009776253007003107, | |
| "clip_ratio/low_min": 0.009776253007003107, | |
| "clip_ratio/region_mean": 0.03381042889552191, | |
| "completions/clipped_ratio": 0.9549632352941176, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 101.41176470588235, | |
| "completions/mean_length": 125.99724264705883, | |
| "completions/mean_terminated_length": 81.53921688304229, | |
| "completions/min_length": 65.23529411764706, | |
| "completions/min_terminated_length": 57.705882352941174, | |
| "epoch": 0.41179744017807457, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.8941831588745117, | |
| "kl": 0.615523195117712, | |
| "learning_rate": 2.4383109492374236e-06, | |
| "loss": 0.1677, | |
| "num_tokens": 6007113.0, | |
| "reward": 2.1543740524965176, | |
| "reward_std": 5.060079883126652, | |
| "rewards/RewardModelWrapper/mean": 2.1543740524965176, | |
| "rewards/RewardModelWrapper/std": 5.475296539418838, | |
| "step": 1850 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.022410094959195704, | |
| "clip_ratio/high_mean": 0.022410094959195704, | |
| "clip_ratio/low_mean": 0.012868442094186321, | |
| "clip_ratio/low_min": 0.012868442094186321, | |
| "clip_ratio/region_mean": 0.035278537014964965, | |
| "completions/clipped_ratio": 0.9604779411764706, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 91.94117647058823, | |
| "completions/mean_length": 126.30238970588235, | |
| "completions/mean_terminated_length": 79.55490246941062, | |
| "completions/min_length": 71.88235294117646, | |
| "completions/min_terminated_length": 64.3529411764706, | |
| "epoch": 0.42292710072342793, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.654233932495117, | |
| "kl": 0.6534902662038803, | |
| "learning_rate": 2.421238333712725e-06, | |
| "loss": 0.1843, | |
| "num_tokens": 6174378.0, | |
| "reward": 2.3925238006255207, | |
| "reward_std": 4.879058487275067, | |
| "rewards/RewardModelWrapper/mean": 2.3925238006255207, | |
| "rewards/RewardModelWrapper/std": 5.418860211091883, | |
| "step": 1900 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021921868621138856, | |
| "clip_ratio/high_mean": 0.021921868621138856, | |
| "clip_ratio/low_mean": 0.011612088698893786, | |
| "clip_ratio/low_min": 0.011612088698893786, | |
| "clip_ratio/region_mean": 0.03353395750047639, | |
| "completions/clipped_ratio": 0.94140625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 107.3125, | |
| "completions/mean_length": 125.32421875, | |
| "completions/mean_terminated_length": 82.90129089355469, | |
| "completions/min_length": 54.5, | |
| "completions/min_terminated_length": 54.5, | |
| "epoch": 0.4340567612687813, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 2.8473002910614014, | |
| "kl": 0.6599007929861546, | |
| "learning_rate": 2.4041657181880266e-06, | |
| "loss": 0.1769, | |
| "num_tokens": 6330166.0, | |
| "reward": 2.618502587080002, | |
| "reward_std": 4.749881863594055, | |
| "rewards/RewardModelWrapper/mean": 2.618502587080002, | |
| "rewards/RewardModelWrapper/std": 5.46898752450943, | |
| "step": 1950 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02155641552293673, | |
| "clip_ratio/high_mean": 0.02155641552293673, | |
| "clip_ratio/low_mean": 0.009289601502241568, | |
| "clip_ratio/low_min": 0.009289601502241568, | |
| "clip_ratio/region_mean": 0.030846016986761243, | |
| "completions/clipped_ratio": 0.9568014705882353, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 101.88235294117646, | |
| "completions/mean_length": 126.37040441176471, | |
| "completions/mean_terminated_length": 83.31176578297334, | |
| "completions/min_length": 72.88235294117646, | |
| "completions/min_terminated_length": 65.3529411764706, | |
| "epoch": 0.44518642181413465, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.8109655380249023, | |
| "kl": 0.7074493160843849, | |
| "learning_rate": 2.387093102663328e-06, | |
| "loss": 0.1974, | |
| "num_tokens": 6496761.0, | |
| "reward": 3.343541706309599, | |
| "reward_std": 4.7992883710300225, | |
| "rewards/RewardModelWrapper/mean": 3.343541706309599, | |
| "rewards/RewardModelWrapper/std": 5.472757451674518, | |
| "step": 2000 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.027084801244782283, | |
| "clip_ratio/high_mean": 0.027084801244782283, | |
| "clip_ratio/low_mean": 0.006253871699154843, | |
| "clip_ratio/low_min": 0.006253871699154843, | |
| "clip_ratio/region_mean": 0.03333867286099121, | |
| "completions/clipped_ratio": 0.9503676470588235, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 94.88235294117646, | |
| "completions/mean_length": 125.38786764705883, | |
| "completions/mean_terminated_length": 66.87544497321633, | |
| "completions/min_length": 58.1764705882353, | |
| "completions/min_terminated_length": 43.11764705882353, | |
| "epoch": 0.45631608235948806, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.87650203704834, | |
| "kl": 0.6578731602430343, | |
| "learning_rate": 2.3700204871386297e-06, | |
| "loss": 0.1804, | |
| "num_tokens": 6662615.0, | |
| "reward": 1.8048853032729204, | |
| "reward_std": 5.33220240649055, | |
| "rewards/RewardModelWrapper/mean": 1.8048853032729204, | |
| "rewards/RewardModelWrapper/std": 5.8003731334910675, | |
| "step": 2050 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.022799394286703318, | |
| "clip_ratio/high_mean": 0.022799394286703318, | |
| "clip_ratio/low_mean": 0.008315351814671886, | |
| "clip_ratio/low_min": 0.008315351814671886, | |
| "clip_ratio/region_mean": 0.03111474617384374, | |
| "completions/clipped_ratio": 0.9609375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 87.0625, | |
| "completions/mean_length": 125.9619140625, | |
| "completions/mean_terminated_length": 69.97916746139526, | |
| "completions/min_length": 70.25, | |
| "completions/min_terminated_length": 54.25, | |
| "epoch": 0.4674457429048414, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.432864189147949, | |
| "kl": 0.7079492492973805, | |
| "learning_rate": 2.3529478716139312e-06, | |
| "loss": 0.1956, | |
| "num_tokens": 6819392.0, | |
| "reward": 2.50741083920002, | |
| "reward_std": 5.116829484701157, | |
| "rewards/RewardModelWrapper/mean": 2.50741083920002, | |
| "rewards/RewardModelWrapper/std": 5.797209560871124, | |
| "step": 2100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.022400263713207094, | |
| "clip_ratio/high_mean": 0.022400263713207094, | |
| "clip_ratio/low_mean": 0.008116541813942604, | |
| "clip_ratio/low_min": 0.008116541813942604, | |
| "clip_ratio/region_mean": 0.030516805413644762, | |
| "completions/clipped_ratio": 0.9347426470588235, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 100.88235294117646, | |
| "completions/mean_length": 125.04779411764706, | |
| "completions/mean_terminated_length": 75.39313866110409, | |
| "completions/min_length": 51.1764705882353, | |
| "completions/min_terminated_length": 43.64705882352941, | |
| "epoch": 0.4785754034501948, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.323696136474609, | |
| "kl": 0.7352806448936462, | |
| "learning_rate": 2.3358752560892328e-06, | |
| "loss": 0.1983, | |
| "num_tokens": 6984364.0, | |
| "reward": 2.347130256540635, | |
| "reward_std": 5.3196556708391975, | |
| "rewards/RewardModelWrapper/mean": 2.347130256540635, | |
| "rewards/RewardModelWrapper/std": 5.771211035111371, | |
| "step": 2150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021681377917993815, | |
| "clip_ratio/high_mean": 0.021681377917993815, | |
| "clip_ratio/low_mean": 0.01035779433674179, | |
| "clip_ratio/low_min": 0.01035779433674179, | |
| "clip_ratio/region_mean": 0.03203917214414105, | |
| "completions/clipped_ratio": 0.9466911764705882, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 109.29411764705883, | |
| "completions/mean_length": 125.82996323529412, | |
| "completions/mean_terminated_length": 90.63718593821807, | |
| "completions/min_length": 71.23529411764706, | |
| "completions/min_terminated_length": 71.23529411764706, | |
| "epoch": 0.48970506399554814, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.768919944763184, | |
| "kl": 0.7503913494944573, | |
| "learning_rate": 2.3188026405645343e-06, | |
| "loss": 0.2126, | |
| "num_tokens": 7150483.0, | |
| "reward": 2.530949129777796, | |
| "reward_std": 5.144188319935518, | |
| "rewards/RewardModelWrapper/mean": 2.530949129777796, | |
| "rewards/RewardModelWrapper/std": 5.636184664333568, | |
| "step": 2200 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02238714267965406, | |
| "clip_ratio/high_mean": 0.02238714267965406, | |
| "clip_ratio/low_mean": 0.008641490781737957, | |
| "clip_ratio/low_min": 0.008641490781737957, | |
| "clip_ratio/region_mean": 0.031028633578680454, | |
| "completions/clipped_ratio": 0.927734375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 97.6875, | |
| "completions/mean_length": 125.126953125, | |
| "completions/mean_terminated_length": 79.61108827590942, | |
| "completions/min_length": 60.875, | |
| "completions/min_terminated_length": 52.875, | |
| "epoch": 0.5008347245409015, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.544048309326172, | |
| "kl": 0.8463270646333695, | |
| "learning_rate": 2.3017300250398363e-06, | |
| "loss": 0.234, | |
| "num_tokens": 7305749.0, | |
| "reward": 3.097047299146652, | |
| "reward_std": 5.260514736175537, | |
| "rewards/RewardModelWrapper/mean": 3.097047299146652, | |
| "rewards/RewardModelWrapper/std": 5.711855351924896, | |
| "step": 2250 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0228908458375372, | |
| "clip_ratio/high_mean": 0.0228908458375372, | |
| "clip_ratio/low_mean": 0.009188006882905029, | |
| "clip_ratio/low_min": 0.009188006882905029, | |
| "clip_ratio/region_mean": 0.03207885263953358, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 95.58823529411765, | |
| "completions/mean_length": 126.76011029411765, | |
| "completions/mean_terminated_length": 83.82843219532685, | |
| "completions/min_length": 76.82352941176471, | |
| "completions/min_terminated_length": 69.29411764705883, | |
| "epoch": 0.5119643850862549, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.981594562530518, | |
| "kl": 0.8844366371631622, | |
| "learning_rate": 2.284657409515138e-06, | |
| "loss": 0.256, | |
| "num_tokens": 7472592.0, | |
| "reward": 3.071570908322054, | |
| "reward_std": 5.142256512361414, | |
| "rewards/RewardModelWrapper/mean": 3.071570908322054, | |
| "rewards/RewardModelWrapper/std": 5.772335641524371, | |
| "step": 2300 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0240246270573698, | |
| "clip_ratio/high_mean": 0.0240246270573698, | |
| "clip_ratio/low_mean": 0.0069138467891025355, | |
| "clip_ratio/low_min": 0.0069138467891025355, | |
| "clip_ratio/region_mean": 0.03093847391428426, | |
| "completions/clipped_ratio": 0.9448529411764706, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 104.76470588235294, | |
| "completions/mean_length": 125.68014705882354, | |
| "completions/mean_terminated_length": 84.08382460650276, | |
| "completions/min_length": 64.76470588235294, | |
| "completions/min_terminated_length": 57.23529411764706, | |
| "epoch": 0.5230940456316082, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.83440637588501, | |
| "kl": 0.9639204081892967, | |
| "learning_rate": 2.2675847939904393e-06, | |
| "loss": 0.2686, | |
| "num_tokens": 7637628.0, | |
| "reward": 2.8617815410389618, | |
| "reward_std": 5.505871576421401, | |
| "rewards/RewardModelWrapper/mean": 2.8617815410389618, | |
| "rewards/RewardModelWrapper/std": 5.944927496068618, | |
| "step": 2350 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02327756991609931, | |
| "clip_ratio/high_mean": 0.02327756991609931, | |
| "clip_ratio/low_mean": 0.011412573783891275, | |
| "clip_ratio/low_min": 0.011412573783891275, | |
| "clip_ratio/region_mean": 0.03469014364061877, | |
| "completions/clipped_ratio": 0.9521484375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 112.875, | |
| "completions/mean_length": 126.125, | |
| "completions/mean_terminated_length": 91.77031326293945, | |
| "completions/min_length": 70.3125, | |
| "completions/min_terminated_length": 70.3125, | |
| "epoch": 0.5342237061769616, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.203379154205322, | |
| "kl": 1.0460551810264587, | |
| "learning_rate": 2.250512178465741e-06, | |
| "loss": 0.299, | |
| "num_tokens": 7793988.0, | |
| "reward": 3.6445817947387695, | |
| "reward_std": 5.2445206344127655, | |
| "rewards/RewardModelWrapper/mean": 3.6445817947387695, | |
| "rewards/RewardModelWrapper/std": 5.754371851682663, | |
| "step": 2400 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025683601254131647, | |
| "clip_ratio/high_mean": 0.025683601254131647, | |
| "clip_ratio/low_mean": 0.007094714913982898, | |
| "clip_ratio/low_min": 0.007094714913982898, | |
| "clip_ratio/region_mean": 0.032778316254261884, | |
| "completions/clipped_ratio": 0.9310661764705882, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 113.58823529411765, | |
| "completions/mean_length": 125.19117647058823, | |
| "completions/mean_terminated_length": 87.4491610807531, | |
| "completions/min_length": 52.588235294117645, | |
| "completions/min_terminated_length": 52.588235294117645, | |
| "epoch": 0.5453533667223149, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.791234970092773, | |
| "kl": 1.0378785887360573, | |
| "learning_rate": 2.233439562941043e-06, | |
| "loss": 0.2908, | |
| "num_tokens": 7958828.0, | |
| "reward": 2.4743111414067886, | |
| "reward_std": 5.666090853074017, | |
| "rewards/RewardModelWrapper/mean": 2.4743111414067886, | |
| "rewards/RewardModelWrapper/std": 6.052795522353229, | |
| "step": 2450 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.022869902374222876, | |
| "clip_ratio/high_mean": 0.022869902374222876, | |
| "clip_ratio/low_mean": 0.010338929877325426, | |
| "clip_ratio/low_min": 0.010338929877325426, | |
| "clip_ratio/region_mean": 0.03320883221458644, | |
| "completions/clipped_ratio": 0.9448529411764706, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 106.6470588235294, | |
| "completions/mean_length": 125.87040441176471, | |
| "completions/mean_terminated_length": 87.59656883688534, | |
| "completions/min_length": 75.76470588235294, | |
| "completions/min_terminated_length": 68.23529411764706, | |
| "epoch": 0.5564830272676683, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 7.271297931671143, | |
| "kl": 1.135116419494152, | |
| "learning_rate": 2.2163669474163444e-06, | |
| "loss": 0.3229, | |
| "num_tokens": 8125183.0, | |
| "reward": 2.681288887472714, | |
| "reward_std": 5.512399000280044, | |
| "rewards/RewardModelWrapper/mean": 2.681288887472714, | |
| "rewards/RewardModelWrapper/std": 6.263462291044347, | |
| "step": 2500 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.024373745566699655, | |
| "clip_ratio/high_mean": 0.024373745566699655, | |
| "clip_ratio/low_mean": 0.007875631948991213, | |
| "clip_ratio/low_min": 0.007875631948991213, | |
| "clip_ratio/region_mean": 0.032249377460684625, | |
| "completions/clipped_ratio": 0.962890625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 99.3125, | |
| "completions/mean_length": 126.6416015625, | |
| "completions/mean_terminated_length": 84.72916746139526, | |
| "completions/min_length": 75.75, | |
| "completions/min_terminated_length": 67.75, | |
| "epoch": 0.5676126878130217, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.9515485763549805, | |
| "kl": 1.1067718014121055, | |
| "learning_rate": 2.199294331891646e-06, | |
| "loss": 0.3174, | |
| "num_tokens": 8282648.0, | |
| "reward": 2.5410157814621925, | |
| "reward_std": 5.60416579246521, | |
| "rewards/RewardModelWrapper/mean": 2.5410157814621925, | |
| "rewards/RewardModelWrapper/std": 6.249917358160019, | |
| "step": 2550 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021070915756281464, | |
| "clip_ratio/high_mean": 0.021070915756281464, | |
| "clip_ratio/low_mean": 0.010609990251832641, | |
| "clip_ratio/low_min": 0.010609990251832641, | |
| "clip_ratio/region_mean": 0.03168090590508655, | |
| "completions/clipped_ratio": 0.9613970588235294, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 89.23529411764706, | |
| "completions/mean_length": 126.42463235294117, | |
| "completions/mean_terminated_length": 78.3034320158117, | |
| "completions/min_length": 81.52941176470588, | |
| "completions/min_terminated_length": 66.47058823529412, | |
| "epoch": 0.5787423483583751, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 7.162291049957275, | |
| "kl": 1.2047522097826004, | |
| "learning_rate": 2.1822217163669474e-06, | |
| "loss": 0.3462, | |
| "num_tokens": 8449478.0, | |
| "reward": 3.0924135095932903, | |
| "reward_std": 5.470459377064424, | |
| "rewards/RewardModelWrapper/mean": 3.0924135095932903, | |
| "rewards/RewardModelWrapper/std": 6.024646282196045, | |
| "step": 2600 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02261253957170993, | |
| "clip_ratio/high_mean": 0.02261253957170993, | |
| "clip_ratio/low_mean": 0.008833104789373466, | |
| "clip_ratio/low_min": 0.008833104789373466, | |
| "clip_ratio/region_mean": 0.0314456443907693, | |
| "completions/clipped_ratio": 0.9494485294117647, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 94.29411764705883, | |
| "completions/mean_length": 125.73161764705883, | |
| "completions/mean_terminated_length": 72.89117723352769, | |
| "completions/min_length": 66.52941176470588, | |
| "completions/min_terminated_length": 51.470588235294116, | |
| "epoch": 0.5898720089037285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.318300247192383, | |
| "kl": 1.2395999401807785, | |
| "learning_rate": 2.165149100842249e-06, | |
| "loss": 0.3561, | |
| "num_tokens": 8615194.0, | |
| "reward": 2.5635701067307415, | |
| "reward_std": 5.7780221490299, | |
| "rewards/RewardModelWrapper/mean": 2.5635701067307415, | |
| "rewards/RewardModelWrapper/std": 6.476823947008918, | |
| "step": 2650 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02310706490650773, | |
| "clip_ratio/high_mean": 0.02310706490650773, | |
| "clip_ratio/low_mean": 0.008465991305129136, | |
| "clip_ratio/low_min": 0.008465991305129136, | |
| "clip_ratio/region_mean": 0.03157305620610714, | |
| "completions/clipped_ratio": 0.9462890625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 107.625, | |
| "completions/mean_length": 125.8720703125, | |
| "completions/mean_terminated_length": 88.43675756454468, | |
| "completions/min_length": 65.875, | |
| "completions/min_terminated_length": 65.875, | |
| "epoch": 0.6010016694490818, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.8498454093933105, | |
| "kl": 1.2743730303645133, | |
| "learning_rate": 2.148076485317551e-06, | |
| "loss": 0.3642, | |
| "num_tokens": 8771759.0, | |
| "reward": 3.0489635169506073, | |
| "reward_std": 5.676127910614014, | |
| "rewards/RewardModelWrapper/mean": 3.0489635169506073, | |
| "rewards/RewardModelWrapper/std": 6.18413832783699, | |
| "step": 2700 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017379222289891912, | |
| "clip_ratio/high_mean": 0.017379222289891912, | |
| "clip_ratio/low_mean": 0.012123786294832826, | |
| "clip_ratio/low_min": 0.012123786294832826, | |
| "clip_ratio/region_mean": 0.029503008612664416, | |
| "completions/clipped_ratio": 0.9613970588235294, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 87.82352941176471, | |
| "completions/mean_length": 126.33823529411765, | |
| "completions/mean_terminated_length": 70.85490282844094, | |
| "completions/min_length": 75.94117647058823, | |
| "completions/min_terminated_length": 53.35294117647059, | |
| "epoch": 0.6121313299944352, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.328779697418213, | |
| "kl": 1.5358505266904832, | |
| "learning_rate": 2.1320282267243345e-06, | |
| "loss": 0.44, | |
| "num_tokens": 8938623.0, | |
| "reward": 4.035378414041856, | |
| "reward_std": 5.26687082122354, | |
| "rewards/RewardModelWrapper/mean": 4.035378414041856, | |
| "rewards/RewardModelWrapper/std": 6.010923722211053, | |
| "step": 2750 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02342768482863903, | |
| "clip_ratio/high_mean": 0.02342768482863903, | |
| "clip_ratio/low_mean": 0.007425281075702514, | |
| "clip_ratio/low_min": 0.007425281075702514, | |
| "clip_ratio/region_mean": 0.03085296612116508, | |
| "completions/clipped_ratio": 0.9476102941176471, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 102.82352941176471, | |
| "completions/mean_length": 125.83272058823529, | |
| "completions/mean_terminated_length": 83.69166744456572, | |
| "completions/min_length": 70.94117647058823, | |
| "completions/min_terminated_length": 63.411764705882355, | |
| "epoch": 0.6232609905397886, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 13.94502067565918, | |
| "kl": 1.3090015414357186, | |
| "learning_rate": 2.114955611199636e-06, | |
| "loss": 0.3749, | |
| "num_tokens": 9104641.0, | |
| "reward": 3.50168057049022, | |
| "reward_std": 5.636927548576804, | |
| "rewards/RewardModelWrapper/mean": 3.50168057049022, | |
| "rewards/RewardModelWrapper/std": 6.223201779758229, | |
| "step": 2800 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.023096702507464217, | |
| "clip_ratio/high_mean": 0.023096702507464217, | |
| "clip_ratio/low_mean": 0.01079344226163812, | |
| "clip_ratio/low_min": 0.01079344226163812, | |
| "clip_ratio/region_mean": 0.033890144524630156, | |
| "completions/clipped_ratio": 0.947265625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 111.125, | |
| "completions/mean_length": 126.0732421875, | |
| "completions/mean_terminated_length": 91.28541803359985, | |
| "completions/min_length": 68.25, | |
| "completions/min_terminated_length": 68.25, | |
| "epoch": 0.6343906510851419, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.740921974182129, | |
| "kl": 1.2168986845016478, | |
| "learning_rate": 2.0978829956749376e-06, | |
| "loss": 0.3468, | |
| "num_tokens": 9260988.0, | |
| "reward": 2.903833270072937, | |
| "reward_std": 5.634722024202347, | |
| "rewards/RewardModelWrapper/mean": 2.903833270072937, | |
| "rewards/RewardModelWrapper/std": 6.182769417762756, | |
| "step": 2850 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021131394968833775, | |
| "clip_ratio/high_mean": 0.021131394968833775, | |
| "clip_ratio/low_mean": 0.00905259191960795, | |
| "clip_ratio/low_min": 0.00905259191960795, | |
| "clip_ratio/region_mean": 0.03018398679094389, | |
| "completions/clipped_ratio": 0.9549632352941176, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 96.94117647058823, | |
| "completions/mean_length": 125.89613970588235, | |
| "completions/mean_terminated_length": 78.88333488913143, | |
| "completions/min_length": 67.82352941176471, | |
| "completions/min_terminated_length": 60.294117647058826, | |
| "epoch": 0.6455203116304953, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.610988616943359, | |
| "kl": 1.332914224267006, | |
| "learning_rate": 2.080810380150239e-06, | |
| "loss": 0.3851, | |
| "num_tokens": 9427003.0, | |
| "reward": 3.4099216741674088, | |
| "reward_std": 5.599381278542912, | |
| "rewards/RewardModelWrapper/mean": 3.4099216741674088, | |
| "rewards/RewardModelWrapper/std": 6.283486815059886, | |
| "step": 2900 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.024977084384299814, | |
| "clip_ratio/high_mean": 0.024977084384299814, | |
| "clip_ratio/low_mean": 0.009850850635266396, | |
| "clip_ratio/low_min": 0.009850850635266396, | |
| "clip_ratio/region_mean": 0.034827935132198035, | |
| "completions/clipped_ratio": 0.9430147058823529, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 105.11764705882354, | |
| "completions/mean_length": 125.54503676470588, | |
| "completions/mean_terminated_length": 85.04131810805377, | |
| "completions/min_length": 64.41176470588235, | |
| "completions/min_terminated_length": 64.41176470588235, | |
| "epoch": 0.6566499721758486, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.673405170440674, | |
| "kl": 1.34111887216568, | |
| "learning_rate": 2.0637377646255406e-06, | |
| "loss": 0.3787, | |
| "num_tokens": 9592884.0, | |
| "reward": 3.767064431134392, | |
| "reward_std": 5.628603626700008, | |
| "rewards/RewardModelWrapper/mean": 3.767064431134392, | |
| "rewards/RewardModelWrapper/std": 6.238466964048498, | |
| "step": 2950 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.019235485673416406, | |
| "clip_ratio/high_mean": 0.019235485673416406, | |
| "clip_ratio/low_mean": 0.008951259328168816, | |
| "clip_ratio/low_min": 0.008951259328168816, | |
| "clip_ratio/region_mean": 0.02818674497772008, | |
| "completions/clipped_ratio": 0.958984375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 93.625, | |
| "completions/mean_length": 126.48046875, | |
| "completions/mean_terminated_length": 79.1166672706604, | |
| "completions/min_length": 76.125, | |
| "completions/min_terminated_length": 60.125, | |
| "epoch": 0.667779632721202, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": null, | |
| "kl": 1.6551162710785865, | |
| "learning_rate": 2.0470066014113363e-06, | |
| "loss": 0.4809, | |
| "num_tokens": 9750288.0, | |
| "reward": 3.3632944226264954, | |
| "reward_std": 5.644728451967239, | |
| "rewards/RewardModelWrapper/mean": 3.3632944226264954, | |
| "rewards/RewardModelWrapper/std": 6.475361466407776, | |
| "step": 3000 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021347561194561424, | |
| "clip_ratio/high_mean": 0.021347561194561424, | |
| "clip_ratio/low_mean": 0.012039180095889605, | |
| "clip_ratio/low_min": 0.012039180095889605, | |
| "clip_ratio/region_mean": 0.03338674116646871, | |
| "completions/clipped_ratio": 0.9641544117647058, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 84.29411764705883, | |
| "completions/mean_length": 126.47426470588235, | |
| "completions/mean_terminated_length": 67.78921643425437, | |
| "completions/min_length": 73.29411764705883, | |
| "completions/min_terminated_length": 50.705882352941174, | |
| "epoch": 0.6789092932665554, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.413740158081055, | |
| "kl": 1.3862884595990181, | |
| "learning_rate": 2.030275438197132e-06, | |
| "loss": 0.4017, | |
| "num_tokens": 9917180.0, | |
| "reward": 3.722391970017377, | |
| "reward_std": 5.822299059699564, | |
| "rewards/RewardModelWrapper/mean": 3.722391970017377, | |
| "rewards/RewardModelWrapper/std": 6.463091822231517, | |
| "step": 3050 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.018999405660433694, | |
| "clip_ratio/high_mean": 0.018999405660433694, | |
| "clip_ratio/low_mean": 0.010441597908793484, | |
| "clip_ratio/low_min": 0.010441597908793484, | |
| "clip_ratio/region_mean": 0.029441003524698316, | |
| "completions/clipped_ratio": 0.9586397058823529, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 112.29411764705883, | |
| "completions/mean_length": 126.4623161764706, | |
| "completions/mean_terminated_length": 90.0686279745663, | |
| "completions/min_length": 63.94117647058823, | |
| "completions/min_terminated_length": 63.94117647058823, | |
| "epoch": 0.6900389538119087, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 11.30611801147461, | |
| "kl": 1.3920823442935943, | |
| "learning_rate": 2.0132028226724335e-06, | |
| "loss": 0.4035, | |
| "num_tokens": 10083867.0, | |
| "reward": 3.71955924875596, | |
| "reward_std": 5.790389762205236, | |
| "rewards/RewardModelWrapper/mean": 3.71955924875596, | |
| "rewards/RewardModelWrapper/std": 6.5407993653241325, | |
| "step": 3100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.02239516925183125, | |
| "clip_ratio/high_mean": 0.02239516925183125, | |
| "clip_ratio/low_mean": 0.010940310020523612, | |
| "clip_ratio/low_min": 0.010940310020523612, | |
| "clip_ratio/region_mean": 0.03333547928952612, | |
| "completions/clipped_ratio": 0.955078125, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 99.875, | |
| "completions/mean_length": 126.2353515625, | |
| "completions/mean_terminated_length": 82.07812547683716, | |
| "completions/min_length": 70.75, | |
| "completions/min_terminated_length": 62.75, | |
| "epoch": 0.7011686143572621, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 8.098145484924316, | |
| "kl": 1.332285776436329, | |
| "learning_rate": 1.996130207147735e-06, | |
| "loss": 0.3818, | |
| "num_tokens": 10240948.0, | |
| "reward": 3.675293631851673, | |
| "reward_std": 5.620851904153824, | |
| "rewards/RewardModelWrapper/mean": 3.675293631851673, | |
| "rewards/RewardModelWrapper/std": 6.339143455028534, | |
| "step": 3150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017545219952007755, | |
| "clip_ratio/high_mean": 0.017545219952007755, | |
| "clip_ratio/low_mean": 0.006160206313361414, | |
| "clip_ratio/low_min": 0.006160206313361414, | |
| "clip_ratio/region_mean": 0.023705426228698343, | |
| "completions/clipped_ratio": 0.9540441176470589, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 105.05882352941177, | |
| "completions/mean_length": 126.03216911764706, | |
| "completions/mean_terminated_length": 81.38235316557042, | |
| "completions/min_length": 66.52941176470588, | |
| "completions/min_terminated_length": 59.0, | |
| "epoch": 0.7122982749026154, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.446596622467041, | |
| "kl": 1.3602485132217408, | |
| "learning_rate": 1.9790575916230366e-06, | |
| "loss": 0.3915, | |
| "num_tokens": 10407047.0, | |
| "reward": 3.461222396177404, | |
| "reward_std": 5.5388546831467576, | |
| "rewards/RewardModelWrapper/mean": 3.461222396177404, | |
| "rewards/RewardModelWrapper/std": 6.420014409457936, | |
| "step": 3200 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01795817382866517, | |
| "clip_ratio/high_mean": 0.01795817382866517, | |
| "clip_ratio/low_mean": 0.008432389081281143, | |
| "clip_ratio/low_min": 0.008432389081281143, | |
| "clip_ratio/region_mean": 0.026390562802553176, | |
| "completions/clipped_ratio": 0.9669117647058824, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 78.94117647058823, | |
| "completions/mean_length": 126.28033088235294, | |
| "completions/mean_terminated_length": 61.84313740449793, | |
| "completions/min_length": 68.05882352941177, | |
| "completions/min_terminated_length": 45.470588235294116, | |
| "epoch": 0.7234279354479688, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 17.92909812927246, | |
| "kl": 1.4322891801595687, | |
| "learning_rate": 1.9619849760983386e-06, | |
| "loss": 0.4131, | |
| "num_tokens": 10573736.0, | |
| "reward": 3.6829915467430565, | |
| "reward_std": 5.790671881507425, | |
| "rewards/RewardModelWrapper/mean": 3.6829915467430565, | |
| "rewards/RewardModelWrapper/std": 6.5448582032147575, | |
| "step": 3250 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01961003711214289, | |
| "clip_ratio/high_mean": 0.01961003711214289, | |
| "clip_ratio/low_mean": 0.010123618032957893, | |
| "clip_ratio/low_min": 0.010123618032957893, | |
| "clip_ratio/region_mean": 0.02973365513375029, | |
| "completions/clipped_ratio": 0.9736328125, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 89.75, | |
| "completions/mean_length": 127.12890625, | |
| "completions/mean_terminated_length": 78.04687547683716, | |
| "completions/min_length": 88.3125, | |
| "completions/min_terminated_length": 64.3125, | |
| "epoch": 0.7345575959933222, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.4893033504486084, | |
| "kl": 1.3992696887254714, | |
| "learning_rate": 1.94491236057364e-06, | |
| "loss": 0.407, | |
| "num_tokens": 10732252.0, | |
| "reward": 4.159975051879883, | |
| "reward_std": 5.596900701522827, | |
| "rewards/RewardModelWrapper/mean": 4.159975051879883, | |
| "rewards/RewardModelWrapper/std": 6.406121611595154, | |
| "step": 3300 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01751380935544148, | |
| "clip_ratio/high_mean": 0.01751380935544148, | |
| "clip_ratio/low_mean": 0.006701366908382625, | |
| "clip_ratio/low_min": 0.006701366908382625, | |
| "clip_ratio/region_mean": 0.02421517624054104, | |
| "completions/clipped_ratio": 0.9632352941176471, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 89.11764705882354, | |
| "completions/mean_length": 126.52113970588235, | |
| "completions/mean_terminated_length": 75.78823538387523, | |
| "completions/min_length": 73.58823529411765, | |
| "completions/min_terminated_length": 58.529411764705884, | |
| "epoch": 0.7456872565386756, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 7.024867057800293, | |
| "kl": 1.48705244243145, | |
| "learning_rate": 1.9278397450489416e-06, | |
| "loss": 0.4302, | |
| "num_tokens": 10898899.0, | |
| "reward": 4.022455299601836, | |
| "reward_std": 6.009893417358398, | |
| "rewards/RewardModelWrapper/mean": 4.022455299601836, | |
| "rewards/RewardModelWrapper/std": 6.55277754278744, | |
| "step": 3350 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020205343069974332, | |
| "clip_ratio/high_mean": 0.020205343069974332, | |
| "clip_ratio/low_mean": 0.008244332130707334, | |
| "clip_ratio/low_min": 0.008244332130707334, | |
| "clip_ratio/region_mean": 0.028449675207957624, | |
| "completions/clipped_ratio": 0.953125, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 83.52941176470588, | |
| "completions/mean_length": 125.67463235294117, | |
| "completions/mean_terminated_length": 65.16414619894589, | |
| "completions/min_length": 67.76470588235294, | |
| "completions/min_terminated_length": 45.1764705882353, | |
| "epoch": 0.756816917084029, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.548982620239258, | |
| "kl": 1.4358280056715012, | |
| "learning_rate": 1.910767129524243e-06, | |
| "loss": 0.4127, | |
| "num_tokens": 11065097.0, | |
| "reward": 3.6614036700304817, | |
| "reward_std": 5.941182669471292, | |
| "rewards/RewardModelWrapper/mean": 3.6614036700304817, | |
| "rewards/RewardModelWrapper/std": 6.68101375243243, | |
| "step": 3400 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.018692465843632818, | |
| "clip_ratio/high_mean": 0.018692465843632818, | |
| "clip_ratio/low_mean": 0.008573709986812901, | |
| "clip_ratio/low_min": 0.008573709986812901, | |
| "clip_ratio/region_mean": 0.02726617576321587, | |
| "completions/clipped_ratio": 0.9541015625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 98.8125, | |
| "completions/mean_length": 126.1083984375, | |
| "completions/mean_terminated_length": 77.7172622680664, | |
| "completions/min_length": 63.125, | |
| "completions/min_terminated_length": 55.125, | |
| "epoch": 0.7679465776293823, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.6161601543426514, | |
| "kl": 1.4051894819736481, | |
| "learning_rate": 1.8936945139995447e-06, | |
| "loss": 0.4055, | |
| "num_tokens": 11221336.0, | |
| "reward": 2.737824946641922, | |
| "reward_std": 6.139679282903671, | |
| "rewards/RewardModelWrapper/mean": 2.737824946641922, | |
| "rewards/RewardModelWrapper/std": 6.881059348583221, | |
| "step": 3450 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.019909201117698103, | |
| "clip_ratio/high_mean": 0.019909201117698103, | |
| "clip_ratio/low_mean": 0.009944785697734914, | |
| "clip_ratio/low_min": 0.009944785697734914, | |
| "clip_ratio/region_mean": 0.029853986804373563, | |
| "completions/clipped_ratio": 0.9733455882352942, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 91.88235294117646, | |
| "completions/mean_length": 127.20036764705883, | |
| "completions/mean_terminated_length": 86.0049025591682, | |
| "completions/min_length": 94.52941176470588, | |
| "completions/min_terminated_length": 79.47058823529412, | |
| "epoch": 0.7790762381747357, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 7.78251314163208, | |
| "kl": 1.4502297604084016, | |
| "learning_rate": 1.8766218984748462e-06, | |
| "loss": 0.4266, | |
| "num_tokens": 11389018.0, | |
| "reward": 4.499273047727697, | |
| "reward_std": 5.489500326268813, | |
| "rewards/RewardModelWrapper/mean": 4.499273047727697, | |
| "rewards/RewardModelWrapper/std": 6.2598629839280076, | |
| "step": 3500 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01706919132906478, | |
| "clip_ratio/high_mean": 0.01706919132906478, | |
| "clip_ratio/low_mean": 0.007432717043848243, | |
| "clip_ratio/low_min": 0.007432717043848243, | |
| "clip_ratio/region_mean": 0.024501908438978717, | |
| "completions/clipped_ratio": 0.9568014705882353, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 91.47058823529412, | |
| "completions/mean_length": 126.015625, | |
| "completions/mean_terminated_length": 74.68823646096622, | |
| "completions/min_length": 61.76470588235294, | |
| "completions/min_terminated_length": 54.23529411764706, | |
| "epoch": 0.7902058987200891, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 7.431344509124756, | |
| "kl": 1.4172208327054978, | |
| "learning_rate": 1.859549282950148e-06, | |
| "loss": 0.4053, | |
| "num_tokens": 11555371.0, | |
| "reward": 3.9203204547657684, | |
| "reward_std": 5.879987856921027, | |
| "rewards/RewardModelWrapper/mean": 3.9203204547657684, | |
| "rewards/RewardModelWrapper/std": 6.654794917387121, | |
| "step": 3550 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017978638106724246, | |
| "clip_ratio/high_mean": 0.017978638106724246, | |
| "clip_ratio/low_mean": 0.008542120530910325, | |
| "clip_ratio/low_min": 0.008542120530910325, | |
| "clip_ratio/region_mean": 0.02652075860532932, | |
| "completions/clipped_ratio": 0.95703125, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 105.0625, | |
| "completions/mean_length": 126.1298828125, | |
| "completions/mean_terminated_length": 88.37500047683716, | |
| "completions/min_length": 68.9375, | |
| "completions/min_terminated_length": 68.9375, | |
| "epoch": 0.8013355592654424, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.334597110748291, | |
| "kl": 1.357377045750618, | |
| "learning_rate": 1.8424766674254495e-06, | |
| "loss": 0.39, | |
| "num_tokens": 11712544.0, | |
| "reward": 3.334804505109787, | |
| "reward_std": 6.004520118236542, | |
| "rewards/RewardModelWrapper/mean": 3.334804505109787, | |
| "rewards/RewardModelWrapper/std": 6.608620345592499, | |
| "step": 3600 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01815531796310097, | |
| "clip_ratio/high_mean": 0.01815531796310097, | |
| "clip_ratio/low_mean": 0.00551853927434422, | |
| "clip_ratio/low_min": 0.00551853927434422, | |
| "clip_ratio/region_mean": 0.023673857206013053, | |
| "completions/clipped_ratio": 0.9586397058823529, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 106.0, | |
| "completions/mean_length": 126.57444852941177, | |
| "completions/mean_terminated_length": 91.43627570657169, | |
| "completions/min_length": 73.05882352941177, | |
| "completions/min_terminated_length": 73.05882352941177, | |
| "epoch": 0.8124652198107958, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.32098913192749, | |
| "kl": 1.3787575218081474, | |
| "learning_rate": 1.825404051900751e-06, | |
| "loss": 0.406, | |
| "num_tokens": 11879329.0, | |
| "reward": 4.733595371246338, | |
| "reward_std": 5.286141087027157, | |
| "rewards/RewardModelWrapper/mean": 4.733595371246338, | |
| "rewards/RewardModelWrapper/std": 6.089175813338336, | |
| "step": 3650 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.018292159989941867, | |
| "clip_ratio/high_mean": 0.018292159989941867, | |
| "clip_ratio/low_mean": 0.00964461057272274, | |
| "clip_ratio/low_min": 0.00964461057272274, | |
| "clip_ratio/region_mean": 0.027936770617961883, | |
| "completions/clipped_ratio": 0.9347426470588235, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 100.94117647058823, | |
| "completions/mean_length": 125.18014705882354, | |
| "completions/mean_terminated_length": 79.63531673655791, | |
| "completions/min_length": 62.8235294117647, | |
| "completions/min_terminated_length": 55.294117647058826, | |
| "epoch": 0.8235948803561491, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 11.81167984008789, | |
| "kl": 1.3568527114391327, | |
| "learning_rate": 1.8083314363760528e-06, | |
| "loss": 0.3856, | |
| "num_tokens": 12044285.0, | |
| "reward": 3.853144645690918, | |
| "reward_std": 5.8185105744530174, | |
| "rewards/RewardModelWrapper/mean": 3.853144645690918, | |
| "rewards/RewardModelWrapper/std": 6.648196416742661, | |
| "step": 3700 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020421573969069868, | |
| "clip_ratio/high_mean": 0.020421573969069868, | |
| "clip_ratio/low_mean": 0.006358395353017841, | |
| "clip_ratio/low_min": 0.006358395353017841, | |
| "clip_ratio/region_mean": 0.02677996931830421, | |
| "completions/clipped_ratio": 0.966796875, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 85.4375, | |
| "completions/mean_length": 126.6142578125, | |
| "completions/mean_terminated_length": 75.74791765213013, | |
| "completions/min_length": 65.0, | |
| "completions/min_terminated_length": 65.0, | |
| "epoch": 0.8347245409015025, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.919222593307495, | |
| "kl": 1.4361850446462632, | |
| "learning_rate": 1.7912588208513545e-06, | |
| "loss": 0.4195, | |
| "num_tokens": 12201530.0, | |
| "reward": 4.583240419626236, | |
| "reward_std": 5.661596119403839, | |
| "rewards/RewardModelWrapper/mean": 4.583240419626236, | |
| "rewards/RewardModelWrapper/std": 6.355997741222382, | |
| "step": 3750 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01899803020292893, | |
| "clip_ratio/high_mean": 0.01899803020292893, | |
| "clip_ratio/low_mean": 0.005853212493821047, | |
| "clip_ratio/low_min": 0.005853212493821047, | |
| "clip_ratio/region_mean": 0.02485124268569052, | |
| "completions/clipped_ratio": 0.9485294117647058, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 106.41176470588235, | |
| "completions/mean_length": 125.8529411764706, | |
| "completions/mean_terminated_length": 86.47465066348805, | |
| "completions/min_length": 63.11764705882353, | |
| "completions/min_terminated_length": 63.11764705882353, | |
| "epoch": 0.8458542014468559, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.776216983795166, | |
| "kl": 1.4694108253717422, | |
| "learning_rate": 1.7741862053266563e-06, | |
| "loss": 0.4258, | |
| "num_tokens": 12367226.0, | |
| "reward": 4.691084188573501, | |
| "reward_std": 5.392301559448242, | |
| "rewards/RewardModelWrapper/mean": 4.691084188573501, | |
| "rewards/RewardModelWrapper/std": 6.076854313121123, | |
| "step": 3800 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.020900118886493145, | |
| "clip_ratio/high_mean": 0.020900118886493145, | |
| "clip_ratio/low_mean": 0.008081750934943557, | |
| "clip_ratio/low_min": 0.008081750934943557, | |
| "clip_ratio/region_mean": 0.028981869909912347, | |
| "completions/clipped_ratio": 0.9733455882352942, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 76.76470588235294, | |
| "completions/mean_length": 126.63051470588235, | |
| "completions/mean_terminated_length": 62.98235298605526, | |
| "completions/min_length": 81.41176470588235, | |
| "completions/min_terminated_length": 51.294117647058826, | |
| "epoch": 0.8569838619922092, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.310102462768555, | |
| "kl": 1.3774869224429132, | |
| "learning_rate": 1.7571135898019578e-06, | |
| "loss": 0.398, | |
| "num_tokens": 12534040.0, | |
| "reward": 3.871620360542746, | |
| "reward_std": 5.696767147849588, | |
| "rewards/RewardModelWrapper/mean": 3.871620360542746, | |
| "rewards/RewardModelWrapper/std": 6.582426996792064, | |
| "step": 3850 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.021299479028675704, | |
| "clip_ratio/high_mean": 0.021299479028675704, | |
| "clip_ratio/low_mean": 0.0075305427008424885, | |
| "clip_ratio/low_min": 0.0075305427008424885, | |
| "clip_ratio/region_mean": 0.028830021731555463, | |
| "completions/clipped_ratio": 0.9619140625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 85.8125, | |
| "completions/mean_length": 126.1943359375, | |
| "completions/mean_terminated_length": 71.39270901679993, | |
| "completions/min_length": 71.8125, | |
| "completions/min_terminated_length": 55.8125, | |
| "epoch": 0.8681135225375626, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.930108547210693, | |
| "kl": 1.3843135032057763, | |
| "learning_rate": 1.7400409742772593e-06, | |
| "loss": 0.3964, | |
| "num_tokens": 12690615.0, | |
| "reward": 3.086591437458992, | |
| "reward_std": 6.208359390497208, | |
| "rewards/RewardModelWrapper/mean": 3.086591437458992, | |
| "rewards/RewardModelWrapper/std": 6.8491051197052, | |
| "step": 3900 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.018020967768970875, | |
| "clip_ratio/high_mean": 0.018020967768970875, | |
| "clip_ratio/low_mean": 0.006037966601434163, | |
| "clip_ratio/low_min": 0.006037966601434163, | |
| "clip_ratio/region_mean": 0.024058934384956956, | |
| "completions/clipped_ratio": 0.953125, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 103.76470588235294, | |
| "completions/mean_length": 126.11305147058823, | |
| "completions/mean_terminated_length": 84.38186331356273, | |
| "completions/min_length": 66.11764705882354, | |
| "completions/min_terminated_length": 58.588235294117645, | |
| "epoch": 0.8792431830829159, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.74282169342041, | |
| "kl": 1.432164865732193, | |
| "learning_rate": 1.7229683587525609e-06, | |
| "loss": 0.4134, | |
| "num_tokens": 12857386.0, | |
| "reward": 3.5356551899629483, | |
| "reward_std": 5.877786804648006, | |
| "rewards/RewardModelWrapper/mean": 3.5356551899629483, | |
| "rewards/RewardModelWrapper/std": 6.742880484637092, | |
| "step": 3950 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.016392124033300207, | |
| "clip_ratio/high_mean": 0.016392124033300207, | |
| "clip_ratio/low_mean": 0.00735437709663529, | |
| "clip_ratio/low_min": 0.00735437709663529, | |
| "clip_ratio/region_mean": 0.02374650107929483, | |
| "completions/clipped_ratio": 0.9669117647058824, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 95.0, | |
| "completions/mean_length": 126.86764705882354, | |
| "completions/mean_terminated_length": 82.9313735961914, | |
| "completions/min_length": 84.29411764705883, | |
| "completions/min_terminated_length": 69.23529411764706, | |
| "epoch": 0.8903728436282693, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.2612786293029785, | |
| "kl": 1.4968037492036819, | |
| "learning_rate": 1.7058957432278626e-06, | |
| "loss": 0.4371, | |
| "num_tokens": 13025050.0, | |
| "reward": 3.9833039676441864, | |
| "reward_std": 5.820403575897217, | |
| "rewards/RewardModelWrapper/mean": 3.9833039676441864, | |
| "rewards/RewardModelWrapper/std": 6.59747979220222, | |
| "step": 4000 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013542763022705913, | |
| "clip_ratio/high_mean": 0.013542763022705913, | |
| "clip_ratio/low_mean": 0.007844352710526437, | |
| "clip_ratio/low_min": 0.007844352710526437, | |
| "clip_ratio/region_mean": 0.021387115789111705, | |
| "completions/clipped_ratio": 0.9755859375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 82.0, | |
| "completions/mean_length": 127.052734375, | |
| "completions/mean_terminated_length": 69.609375, | |
| "completions/min_length": 89.8125, | |
| "completions/min_terminated_length": 57.8125, | |
| "epoch": 0.9015025041736227, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 9.22656536102295, | |
| "kl": 1.486895147562027, | |
| "learning_rate": 1.6888231277031642e-06, | |
| "loss": 0.4339, | |
| "num_tokens": 13182896.0, | |
| "reward": 3.8604883551597595, | |
| "reward_std": 5.920006081461906, | |
| "rewards/RewardModelWrapper/mean": 3.8604883551597595, | |
| "rewards/RewardModelWrapper/std": 6.682152062654495, | |
| "step": 4050 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017551230599638076, | |
| "clip_ratio/high_mean": 0.017551230599638076, | |
| "clip_ratio/low_mean": 0.006257881603378337, | |
| "clip_ratio/low_min": 0.006257881603378337, | |
| "clip_ratio/region_mean": 0.023809112217277287, | |
| "completions/clipped_ratio": 0.9604779411764706, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 103.47058823529412, | |
| "completions/mean_length": 126.76011029411765, | |
| "completions/mean_terminated_length": 90.72815165800206, | |
| "completions/min_length": 81.47058823529412, | |
| "completions/min_terminated_length": 73.94117647058823, | |
| "epoch": 0.9126321647189761, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.342564105987549, | |
| "kl": 1.479159579873085, | |
| "learning_rate": 1.6717505121784657e-06, | |
| "loss": 0.434, | |
| "num_tokens": 13349539.0, | |
| "reward": 3.891232869204353, | |
| "reward_std": 5.906516776365392, | |
| "rewards/RewardModelWrapper/mean": 3.891232869204353, | |
| "rewards/RewardModelWrapper/std": 6.87522164513083, | |
| "step": 4100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.017731820455519482, | |
| "clip_ratio/high_mean": 0.017731820455519482, | |
| "clip_ratio/low_mean": 0.0037902081329957583, | |
| "clip_ratio/low_min": 0.0037902081329957583, | |
| "clip_ratio/region_mean": 0.021522028532344847, | |
| "completions/clipped_ratio": 0.9632352941176471, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 102.11764705882354, | |
| "completions/mean_length": 126.3373161764706, | |
| "completions/mean_terminated_length": 80.36274584601907, | |
| "completions/min_length": 61.588235294117645, | |
| "completions/min_terminated_length": 54.05882352941177, | |
| "epoch": 0.9237618252643295, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 8.095427513122559, | |
| "kl": 1.5771301573514938, | |
| "learning_rate": 1.6546778966537674e-06, | |
| "loss": 0.4627, | |
| "num_tokens": 13516058.0, | |
| "reward": 4.4532030890969665, | |
| "reward_std": 5.776824221891515, | |
| "rewards/RewardModelWrapper/mean": 4.4532030890969665, | |
| "rewards/RewardModelWrapper/std": 6.367258969475241, | |
| "step": 4150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.018110398813150824, | |
| "clip_ratio/high_mean": 0.018110398813150824, | |
| "clip_ratio/low_mean": 0.006745649516233243, | |
| "clip_ratio/low_min": 0.006745649516233243, | |
| "clip_ratio/region_mean": 0.024856048391666264, | |
| "completions/clipped_ratio": 0.9609375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 93.9375, | |
| "completions/mean_length": 126.3310546875, | |
| "completions/mean_terminated_length": 81.33363127708435, | |
| "completions/min_length": 74.75, | |
| "completions/min_terminated_length": 66.75, | |
| "epoch": 0.9348914858096828, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.7098264694213867, | |
| "kl": 1.4337137299776077, | |
| "learning_rate": 1.637605281129069e-06, | |
| "loss": 0.4139, | |
| "num_tokens": 13673549.0, | |
| "reward": 3.717156395316124, | |
| "reward_std": 5.887754291296005, | |
| "rewards/RewardModelWrapper/mean": 3.717156395316124, | |
| "rewards/RewardModelWrapper/std": 6.542896807193756, | |
| "step": 4200 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01609679988003336, | |
| "clip_ratio/high_mean": 0.01609679988003336, | |
| "clip_ratio/low_mean": 0.006251108425203711, | |
| "clip_ratio/low_min": 0.006251108425203711, | |
| "clip_ratio/region_mean": 0.022347908235387876, | |
| "completions/clipped_ratio": 0.9466911764705882, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 108.3529411764706, | |
| "completions/mean_length": 126.234375, | |
| "completions/mean_terminated_length": 89.88039308435776, | |
| "completions/min_length": 68.6470588235294, | |
| "completions/min_terminated_length": 61.11764705882353, | |
| "epoch": 0.9460211463550362, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.461524963378906, | |
| "kl": 1.4435530692338943, | |
| "learning_rate": 1.6205326656043705e-06, | |
| "loss": 0.4174, | |
| "num_tokens": 13839820.0, | |
| "reward": 3.5667920813840976, | |
| "reward_std": 5.679576621336095, | |
| "rewards/RewardModelWrapper/mean": 3.5667920813840976, | |
| "rewards/RewardModelWrapper/std": 6.743907311383416, | |
| "step": 4250 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01591621272964403, | |
| "clip_ratio/high_mean": 0.01591621272964403, | |
| "clip_ratio/low_mean": 0.005297647488187067, | |
| "clip_ratio/low_min": 0.005297647488187067, | |
| "clip_ratio/region_mean": 0.021213860225398094, | |
| "completions/clipped_ratio": 0.9669117647058824, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 100.76470588235294, | |
| "completions/mean_length": 126.63602941176471, | |
| "completions/mean_terminated_length": 87.36666780359604, | |
| "completions/min_length": 71.0, | |
| "completions/min_terminated_length": 71.0, | |
| "epoch": 0.9571508069003896, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 8.245085716247559, | |
| "kl": 1.4910035210847854, | |
| "learning_rate": 1.603460050079672e-06, | |
| "loss": 0.4312, | |
| "num_tokens": 14007144.0, | |
| "reward": 4.034057981827679, | |
| "reward_std": 5.743304505067713, | |
| "rewards/RewardModelWrapper/mean": 4.034057981827679, | |
| "rewards/RewardModelWrapper/std": 6.6319817094241875, | |
| "step": 4300 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0162072420923505, | |
| "clip_ratio/high_mean": 0.0162072420923505, | |
| "clip_ratio/low_mean": 0.00646918074140558, | |
| "clip_ratio/low_min": 0.00646918074140558, | |
| "clip_ratio/region_mean": 0.022676422880031168, | |
| "completions/clipped_ratio": 0.9560546875, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 97.0, | |
| "completions/mean_length": 126.1572265625, | |
| "completions/mean_terminated_length": 80.61093807220459, | |
| "completions/min_length": 69.625, | |
| "completions/min_terminated_length": 61.625, | |
| "epoch": 0.9682804674457429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.9271068572998047, | |
| "kl": 1.5564635121822357, | |
| "learning_rate": 1.5863874345549738e-06, | |
| "loss": 0.4481, | |
| "num_tokens": 14163497.0, | |
| "reward": 4.418118596076965, | |
| "reward_std": 5.663649529218674, | |
| "rewards/RewardModelWrapper/mean": 4.418118596076965, | |
| "rewards/RewardModelWrapper/std": 6.5488221347332, | |
| "step": 4350 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015229720452334733, | |
| "clip_ratio/high_mean": 0.015229720452334733, | |
| "clip_ratio/low_mean": 0.005334880515874829, | |
| "clip_ratio/low_min": 0.005334880515874829, | |
| "clip_ratio/region_mean": 0.020564600981306285, | |
| "completions/clipped_ratio": 0.9411764705882353, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 101.0, | |
| "completions/mean_length": 124.7876838235294, | |
| "completions/mean_terminated_length": 74.75882474113914, | |
| "completions/min_length": 50.35294117647059, | |
| "completions/min_terminated_length": 50.35294117647059, | |
| "epoch": 0.9794101279910963, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 7.419699192047119, | |
| "kl": 1.4550988680124284, | |
| "learning_rate": 1.5693148190302755e-06, | |
| "loss": 0.4187, | |
| "num_tokens": 14328034.0, | |
| "reward": 3.752244500552907, | |
| "reward_std": 5.818949124392341, | |
| "rewards/RewardModelWrapper/mean": 3.752244500552907, | |
| "rewards/RewardModelWrapper/std": 6.797629524679745, | |
| "step": 4400 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.018021058345912024, | |
| "clip_ratio/high_mean": 0.018021058345912024, | |
| "clip_ratio/low_mean": 0.0030438171711284667, | |
| "clip_ratio/low_min": 0.0030438171711284667, | |
| "clip_ratio/region_mean": 0.021064875536831097, | |
| "completions/clipped_ratio": 0.9549632352941176, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 96.23529411764706, | |
| "completions/mean_length": 126.15900735294117, | |
| "completions/mean_terminated_length": 79.46218647676356, | |
| "completions/min_length": 68.05882352941177, | |
| "completions/min_terminated_length": 60.529411764705884, | |
| "epoch": 0.9905397885364496, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.946506977081299, | |
| "kl": 1.4544246417284012, | |
| "learning_rate": 1.5522422035055773e-06, | |
| "loss": 0.4204, | |
| "num_tokens": 14494631.0, | |
| "reward": 3.687691057429594, | |
| "reward_std": 5.869795238270479, | |
| "rewards/RewardModelWrapper/mean": 3.687691057429594, | |
| "rewards/RewardModelWrapper/std": 6.839460316826315, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.9996661101836394, | |
| "eval_clip_ratio/high_max": 0.0, | |
| "eval_clip_ratio/high_mean": 0.0, | |
| "eval_clip_ratio/low_mean": 0.0, | |
| "eval_clip_ratio/low_min": 0.0, | |
| "eval_clip_ratio/region_mean": 0.0, | |
| "eval_completions/clipped_ratio": 0.9438291139240507, | |
| "eval_completions/max_length": 128.0, | |
| "eval_completions/max_terminated_length": 53.129746835443036, | |
| "eval_completions/mean_length": 125.40446993670886, | |
| "eval_completions/mean_terminated_length": 48.271835556513146, | |
| "eval_completions/min_length": 96.05696202531645, | |
| "eval_completions/min_terminated_length": 43.39873417721519, | |
| "eval_frac_reward_zero_std": 0.0, | |
| "eval_kl": 1.4363023352019395, | |
| "eval_loss": 0.41118884086608887, | |
| "eval_num_tokens": 14622004.0, | |
| "eval_reward": 3.463206129738047, | |
| "eval_reward_std": 6.040495253722124, | |
| "eval_rewards/RewardModelWrapper/mean": 3.463206129738047, | |
| "eval_rewards/RewardModelWrapper/std": 6.557550964476187, | |
| "eval_runtime": 1430.6223, | |
| "eval_samples_per_second": 0.441, | |
| "eval_steps_per_second": 0.028, | |
| "step": 4491 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 8986, | |
| "num_input_tokens_seen": 14622004, | |
| "num_train_epochs": 2, | |
| "save_steps": 2696, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |