{ "best_global_step": 4491, "best_metric": 0.41118884086608887, "best_model_checkpoint": "models/grpo_toxic_qwen/checkpoint-4491", "epoch": 0.9996661101836394, "eval_steps": 2696, "global_step": 4491, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 128.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 119.59375, "completions/mean_terminated_length": 51.142860412597656, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.00022259321090706732, "frac_reward_zero_std": 0.0, "grad_norm": 5.030076026916504, "kl": 3.605344500101637e-05, "learning_rate": 0.0, "loss": -0.0286, "num_tokens": 9462.0, "reward": -6.696479797363281, "reward_std": 2.205897808074951, "rewards/RewardModelWrapper/mean": -6.696479797363281, "rewards/RewardModelWrapper/std": 2.596616506576538, "step": 1 }, { "clip_ratio/high_max": 0.00045590819666228654, "clip_ratio/high_mean": 0.00045590819666228654, "clip_ratio/low_mean": 9.893491918848333e-05, "clip_ratio/low_min": 9.893491918848333e-05, "clip_ratio/region_mean": 0.0005548431188205485, "completions/clipped_ratio": 0.91015625, "completions/max_length": 128.0, "completions/max_terminated_length": 115.375, "completions/mean_length": 124.541015625, "completions/mean_terminated_length": 88.15992164611816, "completions/min_length": 53.8125, "completions/min_terminated_length": 53.8125, "epoch": 0.011129660545353366, "frac_reward_zero_std": 0.0, "grad_norm": 4.771069526672363, "kl": 0.0014390296781742536, "learning_rate": 7.350000000000001e-07, "loss": -0.0097, "num_tokens": 164224.0, "reward": -6.273432105779648, "reward_std": 2.3787402510643005, "rewards/RewardModelWrapper/mean": -6.273432105779648, "rewards/RewardModelWrapper/std": 3.4789108261466026, "step": 50 }, { "clip_ratio/high_max": 0.0075913356387172825, "clip_ratio/high_mean": 0.0075913356387172825, "clip_ratio/low_mean": 0.003807623453612905, "clip_ratio/low_min": 0.003807623453612905, "clip_ratio/region_mean": 0.011398959086218383, "completions/clipped_ratio": 0.8915441176470589, "completions/max_length": 128.0, "completions/max_terminated_length": 110.17647058823529, "completions/mean_length": 123.2251838235294, "completions/mean_terminated_length": 81.49435559441062, "completions/min_length": 44.470588235294116, "completions/min_terminated_length": 44.470588235294116, "epoch": 0.022259321090706732, "frac_reward_zero_std": 0.0, "grad_norm": 4.885649681091309, "kl": 0.019224860495887695, "learning_rate": 1.485e-06, "loss": -0.0113, "num_tokens": 327613.0, "reward": -5.39674503663007, "reward_std": 2.7843008882859173, "rewards/RewardModelWrapper/mean": -5.39674503663007, "rewards/RewardModelWrapper/std": 3.8948283475988053, "step": 100 }, { "clip_ratio/high_max": 0.01675744824227877, "clip_ratio/high_mean": 0.01675744824227877, "clip_ratio/low_mean": 0.012073511610215065, "clip_ratio/low_min": 0.012073511610215065, "clip_ratio/region_mean": 0.028830959817860276, "completions/clipped_ratio": 0.9091796875, "completions/max_length": 128.0, "completions/max_terminated_length": 107.3125, "completions/mean_length": 124.3720703125, "completions/mean_terminated_length": 81.7018609046936, "completions/min_length": 54.6875, "completions/min_terminated_length": 46.6875, "epoch": 0.0333889816360601, "frac_reward_zero_std": 0.0, "grad_norm": 3.0026164054870605, "kl": 0.04671986572444439, "learning_rate": 2.235e-06, "loss": 0.0052, "num_tokens": 482354.0, "reward": -5.547765076160431, "reward_std": 2.73693485558033, "rewards/RewardModelWrapper/mean": -5.547765076160431, "rewards/RewardModelWrapper/std": 3.441145323216915, "step": 150 }, { "clip_ratio/high_max": 0.02414312065928243, "clip_ratio/high_mean": 0.02414312065928243, "clip_ratio/low_mean": 0.017463966414215975, "clip_ratio/low_min": 0.017463966414215975, "clip_ratio/region_mean": 0.04160708721727133, "completions/clipped_ratio": 0.9200367647058824, "completions/max_length": 128.0, "completions/max_terminated_length": 119.6470588235294, "completions/mean_length": 125.29503676470588, "completions/mean_terminated_length": 94.28872680664062, "completions/min_length": 65.58823529411765, "completions/min_terminated_length": 65.58823529411765, "epoch": 0.044518642181413465, "frac_reward_zero_std": 0.007352941176470588, "grad_norm": 3.9181442260742188, "kl": 0.0877579689398408, "learning_rate": 2.97e-06, "loss": 0.0105, "num_tokens": 648123.0, "reward": -4.304881698944989, "reward_std": 3.38148234872257, "rewards/RewardModelWrapper/mean": -4.304881698944989, "rewards/RewardModelWrapper/std": 4.617817443959853, "step": 200 }, { "clip_ratio/high_max": 0.029861916538793595, "clip_ratio/high_mean": 0.029861916538793595, "clip_ratio/low_mean": 0.023766413825796917, "clip_ratio/low_min": 0.023766413825796917, "clip_ratio/region_mean": 0.05362833026330918, "completions/clipped_ratio": 0.9172794117647058, "completions/max_length": 128.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 124.69761029411765, "completions/mean_terminated_length": 89.32857289033778, "completions/min_length": 53.23529411764706, "completions/min_terminated_length": 53.23529411764706, "epoch": 0.05564830272676683, "frac_reward_zero_std": 0.0, "grad_norm": 6.056563377380371, "kl": 0.14030909642577172, "learning_rate": 2.9836102890962895e-06, "loss": 0.0228, "num_tokens": 813050.0, "reward": -4.152413817013011, "reward_std": 3.3082274689393887, "rewards/RewardModelWrapper/mean": -4.152413817013011, "rewards/RewardModelWrapper/std": 4.367143616956823, "step": 250 }, { "clip_ratio/high_max": 0.030323101801332086, "clip_ratio/high_mean": 0.030323101801332086, "clip_ratio/low_mean": 0.021581946768565105, "clip_ratio/low_min": 0.021581946768565105, "clip_ratio/region_mean": 0.051905048433691266, "completions/clipped_ratio": 0.9248046875, "completions/max_length": 128.0, "completions/max_terminated_length": 116.1875, "completions/mean_length": 125.275390625, "completions/mean_terminated_length": 89.46597385406494, "completions/min_length": 56.5625, "completions/min_terminated_length": 56.5625, "epoch": 0.0667779632721202, "frac_reward_zero_std": 0.0, "grad_norm": 2.983767032623291, "kl": 0.1569075232744217, "learning_rate": 2.966537673571591e-06, "loss": 0.0317, "num_tokens": 969156.0, "reward": -3.388260453939438, "reward_std": 3.2063718885183334, "rewards/RewardModelWrapper/mean": -3.388260453939438, "rewards/RewardModelWrapper/std": 4.789341554045677, "step": 300 }, { "clip_ratio/high_max": 0.027688504084944724, "clip_ratio/high_mean": 0.027688504084944724, "clip_ratio/low_mean": 0.019530020136153327, "clip_ratio/low_min": 0.019530020136153327, "clip_ratio/region_mean": 0.04721852412912995, "completions/clipped_ratio": 0.9191176470588235, "completions/max_length": 128.0, "completions/max_terminated_length": 120.05882352941177, "completions/mean_length": 124.81709558823529, "completions/mean_terminated_length": 90.29201911477482, "completions/min_length": 52.64705882352941, "completions/min_terminated_length": 52.64705882352941, "epoch": 0.07790762381747357, "frac_reward_zero_std": 0.0, "grad_norm": 3.5877625942230225, "kl": 0.1643798241391778, "learning_rate": 2.9494650580468926e-06, "loss": 0.0293, "num_tokens": 1134229.0, "reward": -3.141713114345775, "reward_std": 3.3975463895236744, "rewards/RewardModelWrapper/mean": -3.141713114345775, "rewards/RewardModelWrapper/std": 4.820348431082333, "step": 350 }, { "clip_ratio/high_max": 0.028169492546003313, "clip_ratio/high_mean": 0.028169492546003313, "clip_ratio/low_mean": 0.019790295051643626, "clip_ratio/low_min": 0.019790295051643626, "clip_ratio/region_mean": 0.04795978774316609, "completions/clipped_ratio": 0.9310661764705882, "completions/max_length": 128.0, "completions/max_terminated_length": 108.11764705882354, "completions/mean_length": 125.64889705882354, "completions/mean_terminated_length": 87.01379753561581, "completions/min_length": 62.1764705882353, "completions/min_terminated_length": 54.64705882352941, "epoch": 0.08903728436282693, "frac_reward_zero_std": 0.0, "grad_norm": 5.695188999176025, "kl": 0.30343326754868033, "learning_rate": 2.933416799453676e-06, "loss": 0.0748, "num_tokens": 1300167.0, "reward": -3.474738233229693, "reward_std": 3.482299538219676, "rewards/RewardModelWrapper/mean": -3.474738233229693, "rewards/RewardModelWrapper/std": 4.745730189716115, "step": 400 }, { "clip_ratio/high_max": 0.029925933612976224, "clip_ratio/high_mean": 0.029925933612976224, "clip_ratio/low_mean": 0.019293442433699966, "clip_ratio/low_min": 0.019293442433699966, "clip_ratio/region_mean": 0.04921937589067966, "completions/clipped_ratio": 0.943359375, "completions/max_length": 128.0, "completions/max_terminated_length": 110.9375, "completions/mean_length": 126.4140625, "completions/mean_terminated_length": 95.56250047683716, "completions/min_length": 79.5, "completions/min_terminated_length": 71.5, "epoch": 0.1001669449081803, "frac_reward_zero_std": 0.0, "grad_norm": 3.636467218399048, "kl": 0.19514246992766857, "learning_rate": 2.9163441839289777e-06, "loss": 0.0415, "num_tokens": 1457135.0, "reward": -3.0616072714328766, "reward_std": 3.3436961472034454, "rewards/RewardModelWrapper/mean": -3.0616072714328766, "rewards/RewardModelWrapper/std": 4.945626050233841, "step": 450 }, { "clip_ratio/high_max": 0.027343249125406147, "clip_ratio/high_mean": 0.027343249125406147, "clip_ratio/low_mean": 0.01768903057440184, "clip_ratio/low_min": 0.01768903057440184, "clip_ratio/region_mean": 0.04503227963577956, "completions/clipped_ratio": 0.9393382352941176, "completions/max_length": 128.0, "completions/max_terminated_length": 109.47058823529412, "completions/mean_length": 126.07444852941177, "completions/mean_terminated_length": 92.27339037726907, "completions/min_length": 73.88235294117646, "completions/min_terminated_length": 66.3529411764706, "epoch": 0.11129660545353366, "frac_reward_zero_std": 0.0, "grad_norm": 3.624467134475708, "kl": 0.19471221148967743, "learning_rate": 2.8992715684042796e-06, "loss": 0.0459, "num_tokens": 1623608.0, "reward": -3.0403577299679028, "reward_std": 3.5023320422453037, "rewards/RewardModelWrapper/mean": -3.0403577299679028, "rewards/RewardModelWrapper/std": 4.758344790514777, "step": 500 }, { "clip_ratio/high_max": 0.026099461197154596, "clip_ratio/high_mean": 0.026099461197154596, "clip_ratio/low_mean": 0.01860616845311597, "clip_ratio/low_min": 0.01860616845311597, "clip_ratio/region_mean": 0.04470562972594053, "completions/clipped_ratio": 0.9209558823529411, "completions/max_length": 128.0, "completions/max_terminated_length": 111.3529411764706, "completions/mean_length": 125.32536764705883, "completions/mean_terminated_length": 86.94334905287799, "completions/min_length": 54.64705882352941, "completions/min_terminated_length": 47.11764705882353, "epoch": 0.12242626599888703, "frac_reward_zero_std": 0.0, "grad_norm": 5.117679119110107, "kl": 0.1926309671998024, "learning_rate": 2.882198952879581e-06, "loss": 0.0424, "num_tokens": 1789042.0, "reward": -3.364777831470265, "reward_std": 3.6073132402756634, "rewards/RewardModelWrapper/mean": -3.364777831470265, "rewards/RewardModelWrapper/std": 4.984979461221134, "step": 550 }, { "clip_ratio/high_max": 0.027654693657532335, "clip_ratio/high_mean": 0.027654693657532335, "clip_ratio/low_mean": 0.01964853117824532, "clip_ratio/low_min": 0.01964853117824532, "clip_ratio/region_mean": 0.047303224778734144, "completions/clipped_ratio": 0.8984375, "completions/max_length": 128.0, "completions/max_terminated_length": 124.5, "completions/mean_length": 124.734375, "completions/mean_terminated_length": 98.57239484786987, "completions/min_length": 61.25, "completions/min_terminated_length": 61.25, "epoch": 0.1335559265442404, "frac_reward_zero_std": 0.0, "grad_norm": 3.241142511367798, "kl": 0.211693402081728, "learning_rate": 2.865126337354883e-06, "loss": 0.0498, "num_tokens": 1944610.0, "reward": -2.732655808329582, "reward_std": 3.617541193962097, "rewards/RewardModelWrapper/mean": -2.732655808329582, "rewards/RewardModelWrapper/std": 4.809614151716232, "step": 600 }, { "clip_ratio/high_max": 0.027527469391934574, "clip_ratio/high_mean": 0.027527469391934574, "clip_ratio/low_mean": 0.019259323065634815, "clip_ratio/low_min": 0.019259323065634815, "clip_ratio/region_mean": 0.046786792553029956, "completions/clipped_ratio": 0.8933823529411765, "completions/max_length": 128.0, "completions/max_terminated_length": 121.47058823529412, "completions/mean_length": 124.16727941176471, "completions/mean_terminated_length": 96.0017848295324, "completions/min_length": 59.23529411764706, "completions/min_terminated_length": 59.23529411764706, "epoch": 0.14468558708959378, "frac_reward_zero_std": 0.0, "grad_norm": 4.524644374847412, "kl": 0.2376550894230604, "learning_rate": 2.8480537218301847e-06, "loss": 0.0528, "num_tokens": 2109128.0, "reward": -1.8917093557469986, "reward_std": 3.8112815267899456, "rewards/RewardModelWrapper/mean": -1.8917093557469986, "rewards/RewardModelWrapper/std": 5.167453260982738, "step": 650 }, { "clip_ratio/high_max": 0.027425415357574822, "clip_ratio/high_mean": 0.027425415357574822, "clip_ratio/low_mean": 0.01982414353871718, "clip_ratio/low_min": 0.01982414353871718, "clip_ratio/region_mean": 0.04724955870769918, "completions/clipped_ratio": 0.8602941176470589, "completions/max_length": 128.0, "completions/max_terminated_length": 121.05882352941177, "completions/mean_length": 123.19117647058823, "completions/mean_terminated_length": 94.26595889820771, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.15581524763494714, "frac_reward_zero_std": 0.0, "grad_norm": 5.250056743621826, "kl": 0.22009442321956157, "learning_rate": 2.830981106305486e-06, "loss": 0.044, "num_tokens": 2272320.0, "reward": -2.427665850695442, "reward_std": 3.78492192661061, "rewards/RewardModelWrapper/mean": -2.427665850695442, "rewards/RewardModelWrapper/std": 4.859750719631419, "step": 700 }, { "clip_ratio/high_max": 0.02454757507191971, "clip_ratio/high_mean": 0.02454757507191971, "clip_ratio/low_mean": 0.0160788345040055, "clip_ratio/low_min": 0.0160788345040055, "clip_ratio/region_mean": 0.04062640947755426, "completions/clipped_ratio": 0.8837890625, "completions/max_length": 128.0, "completions/max_terminated_length": 117.25, "completions/mean_length": 123.8193359375, "completions/mean_terminated_length": 92.61992502212524, "completions/min_length": 56.125, "completions/min_terminated_length": 56.125, "epoch": 0.1669449081803005, "frac_reward_zero_std": 0.0, "grad_norm": 6.143310070037842, "kl": 0.2187542901188135, "learning_rate": 2.8139084907807877e-06, "loss": 0.0458, "num_tokens": 2426567.0, "reward": -2.639084130525589, "reward_std": 4.0981148183345795, "rewards/RewardModelWrapper/mean": -2.639084130525589, "rewards/RewardModelWrapper/std": 5.267414927482605, "step": 750 }, { "clip_ratio/high_max": 0.023827595426701008, "clip_ratio/high_mean": 0.023827595426701008, "clip_ratio/low_mean": 0.01665229408070445, "clip_ratio/low_min": 0.01665229408070445, "clip_ratio/region_mean": 0.04047988944686949, "completions/clipped_ratio": 0.9172794117647058, "completions/max_length": 128.0, "completions/max_terminated_length": 117.41176470588235, "completions/mean_length": 124.52849264705883, "completions/mean_terminated_length": 86.05495004092946, "completions/min_length": 53.294117647058826, "completions/min_terminated_length": 53.294117647058826, "epoch": 0.17807456872565386, "frac_reward_zero_std": 0.0, "grad_norm": 3.82859206199646, "kl": 0.2341267079859972, "learning_rate": 2.7968358752560893e-06, "loss": 0.0495, "num_tokens": 2591422.0, "reward": -1.696559471242568, "reward_std": 4.100044530980727, "rewards/RewardModelWrapper/mean": -1.696559471242568, "rewards/RewardModelWrapper/std": 5.4215626155628875, "step": 800 }, { "clip_ratio/high_max": 0.025062179565429686, "clip_ratio/high_mean": 0.025062179565429686, "clip_ratio/low_mean": 0.018277215642156078, "clip_ratio/low_min": 0.018277215642156078, "clip_ratio/region_mean": 0.04333939506206661, "completions/clipped_ratio": 0.9292279411764706, "completions/max_length": 128.0, "completions/max_terminated_length": 113.58823529411765, "completions/mean_length": 125.45588235294117, "completions/mean_terminated_length": 91.19166834214154, "completions/min_length": 58.94117647058823, "completions/min_terminated_length": 58.94117647058823, "epoch": 0.18920422927100725, "frac_reward_zero_std": 0.0, "grad_norm": 4.91913366317749, "kl": 0.22518106378614902, "learning_rate": 2.779763259731391e-06, "loss": 0.0531, "num_tokens": 2757254.0, "reward": -0.3187214837354772, "reward_std": 5.127424436457017, "rewards/RewardModelWrapper/mean": -0.3187214837354772, "rewards/RewardModelWrapper/std": 5.87655990263995, "step": 850 }, { "clip_ratio/high_max": 0.02293195443926379, "clip_ratio/high_mean": 0.02293195443926379, "clip_ratio/low_mean": 0.017691890239948407, "clip_ratio/low_min": 0.017691890239948407, "clip_ratio/region_mean": 0.040623844610527156, "completions/clipped_ratio": 0.9091796875, "completions/max_length": 128.0, "completions/max_terminated_length": 119.875, "completions/mean_length": 124.306640625, "completions/mean_terminated_length": 88.5287561416626, "completions/min_length": 47.5, "completions/min_terminated_length": 47.5, "epoch": 0.2003338898163606, "frac_reward_zero_std": 0.0, "grad_norm": 2.8705227375030518, "kl": 0.23920009069144726, "learning_rate": 2.7626906442066923e-06, "loss": 0.0608, "num_tokens": 2912304.0, "reward": -0.5163702219724655, "reward_std": 5.298731863498688, "rewards/RewardModelWrapper/mean": -0.5163702219724655, "rewards/RewardModelWrapper/std": 5.84825000166893, "step": 900 }, { "clip_ratio/high_max": 0.02397001946810633, "clip_ratio/high_mean": 0.02397001946810633, "clip_ratio/low_mean": 0.016966249566758053, "clip_ratio/low_min": 0.016966249566758053, "clip_ratio/region_mean": 0.040936269152443854, "completions/clipped_ratio": 0.9053308823529411, "completions/max_length": 128.0, "completions/max_terminated_length": 112.17647058823529, "completions/mean_length": 124.65533088235294, "completions/mean_terminated_length": 90.35452988568474, "completions/min_length": 57.64705882352941, "completions/min_terminated_length": 57.64705882352941, "epoch": 0.21146355036171396, "frac_reward_zero_std": 0.0, "grad_norm": 4.009652137756348, "kl": 0.28723155200481415, "learning_rate": 2.7456180286819943e-06, "loss": 0.0623, "num_tokens": 3077033.0, "reward": 0.3360518918317907, "reward_std": 5.125342537375057, "rewards/RewardModelWrapper/mean": 0.3360518918317907, "rewards/RewardModelWrapper/std": 5.78782990399529, "step": 950 }, { "clip_ratio/high_max": 0.025908510715235023, "clip_ratio/high_mean": 0.025908510715235023, "clip_ratio/low_mean": 0.017599179263343104, "clip_ratio/low_min": 0.017599179263343104, "clip_ratio/region_mean": 0.04350769010838121, "completions/clipped_ratio": 0.9172794117647058, "completions/max_length": 128.0, "completions/max_terminated_length": 121.41176470588235, "completions/mean_length": 125.03125, "completions/mean_terminated_length": 94.79173772475299, "completions/min_length": 61.88235294117647, "completions/min_terminated_length": 61.88235294117647, "epoch": 0.22259321090706732, "frac_reward_zero_std": 0.0, "grad_norm": 7.159637928009033, "kl": 0.35069699488580225, "learning_rate": 2.728545413157296e-06, "loss": 0.0847, "num_tokens": 3242123.0, "reward": 1.627946559120627, "reward_std": 4.790118554059197, "rewards/RewardModelWrapper/mean": 1.627946559120627, "rewards/RewardModelWrapper/std": 5.393552022821763, "step": 1000 }, { "clip_ratio/high_max": 0.024083305108360945, "clip_ratio/high_mean": 0.024083305108360945, "clip_ratio/low_mean": 0.013416973181592766, "clip_ratio/low_min": 0.013416973181592766, "clip_ratio/region_mean": 0.0375002783536911, "completions/clipped_ratio": 0.9267578125, "completions/max_length": 128.0, "completions/max_terminated_length": 112.8125, "completions/mean_length": 125.0126953125, "completions/mean_terminated_length": 87.55602884292603, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.2337228714524207, "frac_reward_zero_std": 0.0, "grad_norm": 5.147945880889893, "kl": 0.3470122530311346, "learning_rate": 2.7114727976325973e-06, "loss": 0.089, "num_tokens": 3397872.0, "reward": 0.24295206367969513, "reward_std": 5.033507749438286, "rewards/RewardModelWrapper/mean": 0.24295206367969513, "rewards/RewardModelWrapper/std": 5.808434098958969, "step": 1050 }, { "clip_ratio/high_max": 0.024642590049188583, "clip_ratio/high_mean": 0.024642590049188583, "clip_ratio/low_mean": 0.013819608901976609, "clip_ratio/low_min": 0.013819608901976609, "clip_ratio/region_mean": 0.0384621987817809, "completions/clipped_ratio": 0.9264705882352942, "completions/max_length": 128.0, "completions/max_terminated_length": 112.70588235294117, "completions/mean_length": 125.2408088235294, "completions/mean_terminated_length": 90.793908960679, "completions/min_length": 65.17647058823529, "completions/min_terminated_length": 65.17647058823529, "epoch": 0.24485253199777407, "frac_reward_zero_std": 0.0, "grad_norm": 3.4217491149902344, "kl": 0.377669473439455, "learning_rate": 2.694400182107899e-06, "loss": 0.0977, "num_tokens": 3563110.0, "reward": 0.4081239700317383, "reward_std": 5.032071225783405, "rewards/RewardModelWrapper/mean": 0.4081239700317383, "rewards/RewardModelWrapper/std": 5.893623436198515, "step": 1100 }, { "clip_ratio/high_max": 0.022081555526237934, "clip_ratio/high_mean": 0.022081555526237934, "clip_ratio/low_mean": 0.015956819643906783, "clip_ratio/low_min": 0.015956819643906783, "clip_ratio/region_mean": 0.038038374953903255, "completions/clipped_ratio": 0.9365808823529411, "completions/max_length": 128.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 125.22426470588235, "completions/mean_terminated_length": 73.54575303021599, "completions/min_length": 63.94117647058823, "completions/min_terminated_length": 48.88235294117647, "epoch": 0.25598219254312743, "frac_reward_zero_std": 0.0, "grad_norm": 4.748858451843262, "kl": 0.42920990511775015, "learning_rate": 2.677327566583201e-06, "loss": 0.1117, "num_tokens": 3728050.0, "reward": 1.730754810221055, "reward_std": 4.819248423856847, "rewards/RewardModelWrapper/mean": 1.730754810221055, "rewards/RewardModelWrapper/std": 5.504547006943646, "step": 1150 }, { "clip_ratio/high_max": 0.02080470887827687, "clip_ratio/high_mean": 0.02080470887827687, "clip_ratio/low_mean": 0.01475024281651713, "clip_ratio/low_min": 0.01475024281651713, "clip_ratio/region_mean": 0.03555495172040537, "completions/clipped_ratio": 0.962890625, "completions/max_length": 128.0, "completions/max_terminated_length": 98.625, "completions/mean_length": 126.416015625, "completions/mean_terminated_length": 81.55208349227905, "completions/min_length": 72.375, "completions/min_terminated_length": 64.375, "epoch": 0.2671118530884808, "frac_reward_zero_std": 0.0, "grad_norm": 3.4883859157562256, "kl": 0.42720063477754594, "learning_rate": 2.6602549510585024e-06, "loss": 0.1195, "num_tokens": 3884876.0, "reward": 1.9971511512994766, "reward_std": 4.786640420556068, "rewards/RewardModelWrapper/mean": 1.9971511512994766, "rewards/RewardModelWrapper/std": 5.766968697309494, "step": 1200 }, { "clip_ratio/high_max": 0.023233274864032864, "clip_ratio/high_mean": 0.023233274864032864, "clip_ratio/low_mean": 0.01158983559376793, "clip_ratio/low_min": 0.01158983559376793, "clip_ratio/region_mean": 0.03482311038998887, "completions/clipped_ratio": 0.9347426470588235, "completions/max_length": 128.0, "completions/max_terminated_length": 114.17647058823529, "completions/mean_length": 125.17738970588235, "completions/mean_terminated_length": 89.91648954503677, "completions/min_length": 64.29411764705883, "completions/min_terminated_length": 64.29411764705883, "epoch": 0.27824151363383415, "frac_reward_zero_std": 0.0, "grad_norm": 3.5916192531585693, "kl": 0.3887044958770275, "learning_rate": 2.643182335533804e-06, "loss": 0.1015, "num_tokens": 4050013.0, "reward": 0.8245974989498362, "reward_std": 5.001701130586512, "rewards/RewardModelWrapper/mean": 0.8245974989498362, "rewards/RewardModelWrapper/std": 5.80830400130328, "step": 1250 }, { "clip_ratio/high_max": 0.020529154643882067, "clip_ratio/high_mean": 0.020529154643882067, "clip_ratio/low_mean": 0.015356352158414665, "clip_ratio/low_min": 0.015356352158414665, "clip_ratio/region_mean": 0.03588550680316985, "completions/clipped_ratio": 0.9512867647058824, "completions/max_length": 128.0, "completions/max_terminated_length": 100.76470588235294, "completions/mean_length": 125.86305147058823, "completions/mean_terminated_length": 81.50539353314568, "completions/min_length": 63.470588235294116, "completions/min_terminated_length": 55.94117647058823, "epoch": 0.28937117417918756, "frac_reward_zero_std": 0.0, "grad_norm": 6.05077600479126, "kl": 0.4404847612977028, "learning_rate": 2.6261097200091054e-06, "loss": 0.1208, "num_tokens": 4215832.0, "reward": 2.3118093013763428, "reward_std": 4.841920866685755, "rewards/RewardModelWrapper/mean": 2.3118093013763428, "rewards/RewardModelWrapper/std": 5.525171279907227, "step": 1300 }, { "clip_ratio/high_max": 0.024388792894314976, "clip_ratio/high_mean": 0.024388792894314976, "clip_ratio/low_mean": 0.015166401157330256, "clip_ratio/low_min": 0.015166401157330256, "clip_ratio/region_mean": 0.039555194084532556, "completions/clipped_ratio": 0.9345703125, "completions/max_length": 128.0, "completions/max_terminated_length": 111.5, "completions/mean_length": 125.7822265625, "completions/mean_terminated_length": 88.69479322433472, "completions/min_length": 64.0625, "completions/min_terminated_length": 56.0625, "epoch": 0.3005008347245409, "frac_reward_zero_std": 0.0, "grad_norm": 4.280036926269531, "kl": 0.4519739609956741, "learning_rate": 2.609037104484407e-06, "loss": 0.1237, "num_tokens": 4372361.0, "reward": 2.7925052791833878, "reward_std": 4.665284767746925, "rewards/RewardModelWrapper/mean": 2.7925052791833878, "rewards/RewardModelWrapper/std": 5.4118489027023315, "step": 1350 }, { "clip_ratio/high_max": 0.023544567436911166, "clip_ratio/high_mean": 0.023544567436911166, "clip_ratio/low_mean": 0.01299051069712732, "clip_ratio/low_min": 0.01299051069712732, "clip_ratio/region_mean": 0.03653507822658866, "completions/clipped_ratio": 0.9292279411764706, "completions/max_length": 128.0, "completions/max_terminated_length": 111.82352941176471, "completions/mean_length": 124.65165441176471, "completions/mean_terminated_length": 82.68410469503964, "completions/min_length": 46.8235294117647, "completions/min_terminated_length": 46.8235294117647, "epoch": 0.3116304952698943, "frac_reward_zero_std": 0.0, "grad_norm": 3.7401034832000732, "kl": 0.4922306627035141, "learning_rate": 2.591964488959709e-06, "loss": 0.1315, "num_tokens": 4537846.0, "reward": 2.862899471731747, "reward_std": 4.948802695554845, "rewards/RewardModelWrapper/mean": 2.862899471731747, "rewards/RewardModelWrapper/std": 5.503605421851663, "step": 1400 }, { "clip_ratio/high_max": 0.025161673842230812, "clip_ratio/high_mean": 0.025161673842230812, "clip_ratio/low_mean": 0.012126781771657989, "clip_ratio/low_min": 0.012126781771657989, "clip_ratio/region_mean": 0.037288455746602264, "completions/clipped_ratio": 0.9292279411764706, "completions/max_length": 128.0, "completions/max_terminated_length": 117.17647058823529, "completions/mean_length": 125.05882352941177, "completions/mean_terminated_length": 90.7926357493681, "completions/min_length": 62.11764705882353, "completions/min_terminated_length": 62.11764705882353, "epoch": 0.32276015581524764, "frac_reward_zero_std": 0.0, "grad_norm": 5.467922687530518, "kl": 0.46787314653396606, "learning_rate": 2.5748918734350105e-06, "loss": 0.1207, "num_tokens": 4702966.0, "reward": 1.3220273045932545, "reward_std": 4.946936158572926, "rewards/RewardModelWrapper/mean": 1.3220273045932545, "rewards/RewardModelWrapper/std": 5.816557715920841, "step": 1450 }, { "clip_ratio/high_max": 0.025372368972748516, "clip_ratio/high_mean": 0.025372368972748516, "clip_ratio/low_mean": 0.01208616121119121, "clip_ratio/low_min": 0.01208616121119121, "clip_ratio/region_mean": 0.03745853026397526, "completions/clipped_ratio": 0.9599609375, "completions/max_length": 128.0, "completions/max_terminated_length": 88.5625, "completions/mean_length": 126.3486328125, "completions/mean_terminated_length": 74.44687557220459, "completions/min_length": 73.875, "completions/min_terminated_length": 57.875, "epoch": 0.333889816360601, "frac_reward_zero_std": 0.0, "grad_norm": 3.3412301540374756, "kl": 0.5128468088805676, "learning_rate": 2.557819257910312e-06, "loss": 0.1397, "num_tokens": 4860219.0, "reward": 2.0575065165758133, "reward_std": 5.155221775174141, "rewards/RewardModelWrapper/mean": 2.0575065165758133, "rewards/RewardModelWrapper/std": 5.655524164438248, "step": 1500 }, { "clip_ratio/high_max": 0.023876634621992708, "clip_ratio/high_mean": 0.023876634621992708, "clip_ratio/low_mean": 0.013062482952955179, "clip_ratio/low_min": 0.013062482952955179, "clip_ratio/region_mean": 0.03693911746609956, "completions/clipped_ratio": 0.9604779411764706, "completions/max_length": 128.0, "completions/max_terminated_length": 84.47058823529412, "completions/mean_length": 126.6001838235294, "completions/mean_terminated_length": 73.35098131965188, "completions/min_length": 92.58823529411765, "completions/min_terminated_length": 62.470588235294116, "epoch": 0.34501947690595436, "frac_reward_zero_std": 0.0, "grad_norm": 3.4934329986572266, "kl": 0.5125479310750961, "learning_rate": 2.5407466423856135e-06, "loss": 0.139, "num_tokens": 5027488.0, "reward": 2.7399597448461197, "reward_std": 4.745976616354549, "rewards/RewardModelWrapper/mean": 2.7399597448461197, "rewards/RewardModelWrapper/std": 5.300730144276338, "step": 1550 }, { "clip_ratio/high_max": 0.02330939914332703, "clip_ratio/high_mean": 0.02330939914332703, "clip_ratio/low_mean": 0.009550860303861555, "clip_ratio/low_min": 0.009550860303861555, "clip_ratio/region_mean": 0.032860259409062564, "completions/clipped_ratio": 0.9641544117647058, "completions/max_length": 128.0, "completions/max_terminated_length": 95.70588235294117, "completions/mean_length": 126.61397058823529, "completions/mean_terminated_length": 81.39460844152114, "completions/min_length": 81.88235294117646, "completions/min_terminated_length": 66.82352941176471, "epoch": 0.3561491374513077, "frac_reward_zero_std": 0.0, "grad_norm": 4.760233402252197, "kl": 0.5104251652956009, "learning_rate": 2.523674026860915e-06, "loss": 0.1396, "num_tokens": 5193996.0, "reward": 1.9389969741596895, "reward_std": 5.14070810991175, "rewards/RewardModelWrapper/mean": 1.9389969741596895, "rewards/RewardModelWrapper/std": 5.778058921589571, "step": 1600 }, { "clip_ratio/high_max": 0.023669966620218474, "clip_ratio/high_mean": 0.023669966620218474, "clip_ratio/low_mean": 0.012192065346171147, "clip_ratio/low_min": 0.012192065346171147, "clip_ratio/region_mean": 0.035862031998112796, "completions/clipped_ratio": 0.9599609375, "completions/max_length": 128.0, "completions/max_terminated_length": 105.625, "completions/mean_length": 126.4326171875, "completions/mean_terminated_length": 91.0947916507721, "completions/min_length": 77.5, "completions/min_terminated_length": 77.5, "epoch": 0.3672787979966611, "frac_reward_zero_std": 0.0, "grad_norm": 3.496079206466675, "kl": 0.5115452679991722, "learning_rate": 2.5066014113362166e-06, "loss": 0.1421, "num_tokens": 5350823.0, "reward": 2.4139109551906586, "reward_std": 4.767535001039505, "rewards/RewardModelWrapper/mean": 2.4139109551906586, "rewards/RewardModelWrapper/std": 5.584080070257187, "step": 1650 }, { "clip_ratio/high_max": 0.026008948455564677, "clip_ratio/high_mean": 0.026008948455564677, "clip_ratio/low_mean": 0.008926556244841777, "clip_ratio/low_min": 0.008926556244841777, "clip_ratio/region_mean": 0.0349355046171695, "completions/clipped_ratio": 0.9604779411764706, "completions/max_length": 128.0, "completions/max_terminated_length": 105.94117647058823, "completions/mean_length": 126.34926470588235, "completions/mean_terminated_length": 89.24902052037856, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.3784084585420145, "frac_reward_zero_std": 0.0, "grad_norm": 3.9608895778656006, "kl": 0.5186072036623954, "learning_rate": 2.489528795811518e-06, "loss": 0.1427, "num_tokens": 5517627.0, "reward": 1.1372435163049137, "reward_std": 5.190363294938031, "rewards/RewardModelWrapper/mean": 1.1372435163049137, "rewards/RewardModelWrapper/std": 5.777948155122645, "step": 1700 }, { "clip_ratio/high_max": 0.022060031631262973, "clip_ratio/high_mean": 0.022060031631262973, "clip_ratio/low_mean": 0.012312272182898596, "clip_ratio/low_min": 0.012312272182898596, "clip_ratio/region_mean": 0.03437230377923697, "completions/clipped_ratio": 0.9632352941176471, "completions/max_length": 128.0, "completions/max_terminated_length": 102.82352941176471, "completions/mean_length": 126.49540441176471, "completions/mean_terminated_length": 88.83088302612305, "completions/min_length": 76.3529411764706, "completions/min_terminated_length": 68.82352941176471, "epoch": 0.38953811908736785, "frac_reward_zero_std": 0.0, "grad_norm": 5.0580315589904785, "kl": 0.5839428542554379, "learning_rate": 2.4724561802868197e-06, "loss": 0.1614, "num_tokens": 5684102.0, "reward": 2.689769050654243, "reward_std": 4.625988932216869, "rewards/RewardModelWrapper/mean": 2.689769050654243, "rewards/RewardModelWrapper/std": 5.245194827809053, "step": 1750 }, { "clip_ratio/high_max": 0.02739144684630446, "clip_ratio/high_mean": 0.02739144684630446, "clip_ratio/low_mean": 0.012341015862475616, "clip_ratio/low_min": 0.012341015862475616, "clip_ratio/region_mean": 0.03973246271605604, "completions/clipped_ratio": 0.95703125, "completions/max_length": 128.0, "completions/max_terminated_length": 103.5625, "completions/mean_length": 126.060546875, "completions/mean_terminated_length": 83.71354246139526, "completions/min_length": 61.3125, "completions/min_terminated_length": 61.3125, "epoch": 0.4006677796327212, "frac_reward_zero_std": 0.0, "grad_norm": 2.872232437133789, "kl": 0.5619081328809261, "learning_rate": 2.4553835647621216e-06, "loss": 0.1509, "num_tokens": 5840596.0, "reward": 2.3125159442424774, "reward_std": 4.9210382997989655, "rewards/RewardModelWrapper/mean": 2.3125159442424774, "rewards/RewardModelWrapper/std": 5.377374470233917, "step": 1800 }, { "clip_ratio/high_max": 0.024034175912383944, "clip_ratio/high_mean": 0.024034175912383944, "clip_ratio/low_mean": 0.009776253007003107, "clip_ratio/low_min": 0.009776253007003107, "clip_ratio/region_mean": 0.03381042889552191, "completions/clipped_ratio": 0.9549632352941176, "completions/max_length": 128.0, "completions/max_terminated_length": 101.41176470588235, "completions/mean_length": 125.99724264705883, "completions/mean_terminated_length": 81.53921688304229, "completions/min_length": 65.23529411764706, "completions/min_terminated_length": 57.705882352941174, "epoch": 0.41179744017807457, "frac_reward_zero_std": 0.0, "grad_norm": 3.8941831588745117, "kl": 0.615523195117712, "learning_rate": 2.4383109492374236e-06, "loss": 0.1677, "num_tokens": 6007113.0, "reward": 2.1543740524965176, "reward_std": 5.060079883126652, "rewards/RewardModelWrapper/mean": 2.1543740524965176, "rewards/RewardModelWrapper/std": 5.475296539418838, "step": 1850 }, { "clip_ratio/high_max": 0.022410094959195704, "clip_ratio/high_mean": 0.022410094959195704, "clip_ratio/low_mean": 0.012868442094186321, "clip_ratio/low_min": 0.012868442094186321, "clip_ratio/region_mean": 0.035278537014964965, "completions/clipped_ratio": 0.9604779411764706, "completions/max_length": 128.0, "completions/max_terminated_length": 91.94117647058823, "completions/mean_length": 126.30238970588235, "completions/mean_terminated_length": 79.55490246941062, "completions/min_length": 71.88235294117646, "completions/min_terminated_length": 64.3529411764706, "epoch": 0.42292710072342793, "frac_reward_zero_std": 0.0, "grad_norm": 5.654233932495117, "kl": 0.6534902662038803, "learning_rate": 2.421238333712725e-06, "loss": 0.1843, "num_tokens": 6174378.0, "reward": 2.3925238006255207, "reward_std": 4.879058487275067, "rewards/RewardModelWrapper/mean": 2.3925238006255207, "rewards/RewardModelWrapper/std": 5.418860211091883, "step": 1900 }, { "clip_ratio/high_max": 0.021921868621138856, "clip_ratio/high_mean": 0.021921868621138856, "clip_ratio/low_mean": 0.011612088698893786, "clip_ratio/low_min": 0.011612088698893786, "clip_ratio/region_mean": 0.03353395750047639, "completions/clipped_ratio": 0.94140625, "completions/max_length": 128.0, "completions/max_terminated_length": 107.3125, "completions/mean_length": 125.32421875, "completions/mean_terminated_length": 82.90129089355469, "completions/min_length": 54.5, "completions/min_terminated_length": 54.5, "epoch": 0.4340567612687813, "frac_reward_zero_std": 0.0, "grad_norm": 2.8473002910614014, "kl": 0.6599007929861546, "learning_rate": 2.4041657181880266e-06, "loss": 0.1769, "num_tokens": 6330166.0, "reward": 2.618502587080002, "reward_std": 4.749881863594055, "rewards/RewardModelWrapper/mean": 2.618502587080002, "rewards/RewardModelWrapper/std": 5.46898752450943, "step": 1950 }, { "clip_ratio/high_max": 0.02155641552293673, "clip_ratio/high_mean": 0.02155641552293673, "clip_ratio/low_mean": 0.009289601502241568, "clip_ratio/low_min": 0.009289601502241568, "clip_ratio/region_mean": 0.030846016986761243, "completions/clipped_ratio": 0.9568014705882353, "completions/max_length": 128.0, "completions/max_terminated_length": 101.88235294117646, "completions/mean_length": 126.37040441176471, "completions/mean_terminated_length": 83.31176578297334, "completions/min_length": 72.88235294117646, "completions/min_terminated_length": 65.3529411764706, "epoch": 0.44518642181413465, "frac_reward_zero_std": 0.0, "grad_norm": 3.8109655380249023, "kl": 0.7074493160843849, "learning_rate": 2.387093102663328e-06, "loss": 0.1974, "num_tokens": 6496761.0, "reward": 3.343541706309599, "reward_std": 4.7992883710300225, "rewards/RewardModelWrapper/mean": 3.343541706309599, "rewards/RewardModelWrapper/std": 5.472757451674518, "step": 2000 }, { "clip_ratio/high_max": 0.027084801244782283, "clip_ratio/high_mean": 0.027084801244782283, "clip_ratio/low_mean": 0.006253871699154843, "clip_ratio/low_min": 0.006253871699154843, "clip_ratio/region_mean": 0.03333867286099121, "completions/clipped_ratio": 0.9503676470588235, "completions/max_length": 128.0, "completions/max_terminated_length": 94.88235294117646, "completions/mean_length": 125.38786764705883, "completions/mean_terminated_length": 66.87544497321633, "completions/min_length": 58.1764705882353, "completions/min_terminated_length": 43.11764705882353, "epoch": 0.45631608235948806, "frac_reward_zero_std": 0.0, "grad_norm": 6.87650203704834, "kl": 0.6578731602430343, "learning_rate": 2.3700204871386297e-06, "loss": 0.1804, "num_tokens": 6662615.0, "reward": 1.8048853032729204, "reward_std": 5.33220240649055, "rewards/RewardModelWrapper/mean": 1.8048853032729204, "rewards/RewardModelWrapper/std": 5.8003731334910675, "step": 2050 }, { "clip_ratio/high_max": 0.022799394286703318, "clip_ratio/high_mean": 0.022799394286703318, "clip_ratio/low_mean": 0.008315351814671886, "clip_ratio/low_min": 0.008315351814671886, "clip_ratio/region_mean": 0.03111474617384374, "completions/clipped_ratio": 0.9609375, "completions/max_length": 128.0, "completions/max_terminated_length": 87.0625, "completions/mean_length": 125.9619140625, "completions/mean_terminated_length": 69.97916746139526, "completions/min_length": 70.25, "completions/min_terminated_length": 54.25, "epoch": 0.4674457429048414, "frac_reward_zero_std": 0.0, "grad_norm": 3.432864189147949, "kl": 0.7079492492973805, "learning_rate": 2.3529478716139312e-06, "loss": 0.1956, "num_tokens": 6819392.0, "reward": 2.50741083920002, "reward_std": 5.116829484701157, "rewards/RewardModelWrapper/mean": 2.50741083920002, "rewards/RewardModelWrapper/std": 5.797209560871124, "step": 2100 }, { "clip_ratio/high_max": 0.022400263713207094, "clip_ratio/high_mean": 0.022400263713207094, "clip_ratio/low_mean": 0.008116541813942604, "clip_ratio/low_min": 0.008116541813942604, "clip_ratio/region_mean": 0.030516805413644762, "completions/clipped_ratio": 0.9347426470588235, "completions/max_length": 128.0, "completions/max_terminated_length": 100.88235294117646, "completions/mean_length": 125.04779411764706, "completions/mean_terminated_length": 75.39313866110409, "completions/min_length": 51.1764705882353, "completions/min_terminated_length": 43.64705882352941, "epoch": 0.4785754034501948, "frac_reward_zero_std": 0.0, "grad_norm": 4.323696136474609, "kl": 0.7352806448936462, "learning_rate": 2.3358752560892328e-06, "loss": 0.1983, "num_tokens": 6984364.0, "reward": 2.347130256540635, "reward_std": 5.3196556708391975, "rewards/RewardModelWrapper/mean": 2.347130256540635, "rewards/RewardModelWrapper/std": 5.771211035111371, "step": 2150 }, { "clip_ratio/high_max": 0.021681377917993815, "clip_ratio/high_mean": 0.021681377917993815, "clip_ratio/low_mean": 0.01035779433674179, "clip_ratio/low_min": 0.01035779433674179, "clip_ratio/region_mean": 0.03203917214414105, "completions/clipped_ratio": 0.9466911764705882, "completions/max_length": 128.0, "completions/max_terminated_length": 109.29411764705883, "completions/mean_length": 125.82996323529412, "completions/mean_terminated_length": 90.63718593821807, "completions/min_length": 71.23529411764706, "completions/min_terminated_length": 71.23529411764706, "epoch": 0.48970506399554814, "frac_reward_zero_std": 0.0, "grad_norm": 4.768919944763184, "kl": 0.7503913494944573, "learning_rate": 2.3188026405645343e-06, "loss": 0.2126, "num_tokens": 7150483.0, "reward": 2.530949129777796, "reward_std": 5.144188319935518, "rewards/RewardModelWrapper/mean": 2.530949129777796, "rewards/RewardModelWrapper/std": 5.636184664333568, "step": 2200 }, { "clip_ratio/high_max": 0.02238714267965406, "clip_ratio/high_mean": 0.02238714267965406, "clip_ratio/low_mean": 0.008641490781737957, "clip_ratio/low_min": 0.008641490781737957, "clip_ratio/region_mean": 0.031028633578680454, "completions/clipped_ratio": 0.927734375, "completions/max_length": 128.0, "completions/max_terminated_length": 97.6875, "completions/mean_length": 125.126953125, "completions/mean_terminated_length": 79.61108827590942, "completions/min_length": 60.875, "completions/min_terminated_length": 52.875, "epoch": 0.5008347245409015, "frac_reward_zero_std": 0.0, "grad_norm": 4.544048309326172, "kl": 0.8463270646333695, "learning_rate": 2.3017300250398363e-06, "loss": 0.234, "num_tokens": 7305749.0, "reward": 3.097047299146652, "reward_std": 5.260514736175537, "rewards/RewardModelWrapper/mean": 3.097047299146652, "rewards/RewardModelWrapper/std": 5.711855351924896, "step": 2250 }, { "clip_ratio/high_max": 0.0228908458375372, "clip_ratio/high_mean": 0.0228908458375372, "clip_ratio/low_mean": 0.009188006882905029, "clip_ratio/low_min": 0.009188006882905029, "clip_ratio/region_mean": 0.03207885263953358, "completions/clipped_ratio": 0.96875, "completions/max_length": 128.0, "completions/max_terminated_length": 95.58823529411765, "completions/mean_length": 126.76011029411765, "completions/mean_terminated_length": 83.82843219532685, "completions/min_length": 76.82352941176471, "completions/min_terminated_length": 69.29411764705883, "epoch": 0.5119643850862549, "frac_reward_zero_std": 0.0, "grad_norm": 4.981594562530518, "kl": 0.8844366371631622, "learning_rate": 2.284657409515138e-06, "loss": 0.256, "num_tokens": 7472592.0, "reward": 3.071570908322054, "reward_std": 5.142256512361414, "rewards/RewardModelWrapper/mean": 3.071570908322054, "rewards/RewardModelWrapper/std": 5.772335641524371, "step": 2300 }, { "clip_ratio/high_max": 0.0240246270573698, "clip_ratio/high_mean": 0.0240246270573698, "clip_ratio/low_mean": 0.0069138467891025355, "clip_ratio/low_min": 0.0069138467891025355, "clip_ratio/region_mean": 0.03093847391428426, "completions/clipped_ratio": 0.9448529411764706, "completions/max_length": 128.0, "completions/max_terminated_length": 104.76470588235294, "completions/mean_length": 125.68014705882354, "completions/mean_terminated_length": 84.08382460650276, "completions/min_length": 64.76470588235294, "completions/min_terminated_length": 57.23529411764706, "epoch": 0.5230940456316082, "frac_reward_zero_std": 0.0, "grad_norm": 5.83440637588501, "kl": 0.9639204081892967, "learning_rate": 2.2675847939904393e-06, "loss": 0.2686, "num_tokens": 7637628.0, "reward": 2.8617815410389618, "reward_std": 5.505871576421401, "rewards/RewardModelWrapper/mean": 2.8617815410389618, "rewards/RewardModelWrapper/std": 5.944927496068618, "step": 2350 }, { "clip_ratio/high_max": 0.02327756991609931, "clip_ratio/high_mean": 0.02327756991609931, "clip_ratio/low_mean": 0.011412573783891275, "clip_ratio/low_min": 0.011412573783891275, "clip_ratio/region_mean": 0.03469014364061877, "completions/clipped_ratio": 0.9521484375, "completions/max_length": 128.0, "completions/max_terminated_length": 112.875, "completions/mean_length": 126.125, "completions/mean_terminated_length": 91.77031326293945, "completions/min_length": 70.3125, "completions/min_terminated_length": 70.3125, "epoch": 0.5342237061769616, "frac_reward_zero_std": 0.0, "grad_norm": 4.203379154205322, "kl": 1.0460551810264587, "learning_rate": 2.250512178465741e-06, "loss": 0.299, "num_tokens": 7793988.0, "reward": 3.6445817947387695, "reward_std": 5.2445206344127655, "rewards/RewardModelWrapper/mean": 3.6445817947387695, "rewards/RewardModelWrapper/std": 5.754371851682663, "step": 2400 }, { "clip_ratio/high_max": 0.025683601254131647, "clip_ratio/high_mean": 0.025683601254131647, "clip_ratio/low_mean": 0.007094714913982898, "clip_ratio/low_min": 0.007094714913982898, "clip_ratio/region_mean": 0.032778316254261884, "completions/clipped_ratio": 0.9310661764705882, "completions/max_length": 128.0, "completions/max_terminated_length": 113.58823529411765, "completions/mean_length": 125.19117647058823, "completions/mean_terminated_length": 87.4491610807531, "completions/min_length": 52.588235294117645, "completions/min_terminated_length": 52.588235294117645, "epoch": 0.5453533667223149, "frac_reward_zero_std": 0.0, "grad_norm": 4.791234970092773, "kl": 1.0378785887360573, "learning_rate": 2.233439562941043e-06, "loss": 0.2908, "num_tokens": 7958828.0, "reward": 2.4743111414067886, "reward_std": 5.666090853074017, "rewards/RewardModelWrapper/mean": 2.4743111414067886, "rewards/RewardModelWrapper/std": 6.052795522353229, "step": 2450 }, { "clip_ratio/high_max": 0.022869902374222876, "clip_ratio/high_mean": 0.022869902374222876, "clip_ratio/low_mean": 0.010338929877325426, "clip_ratio/low_min": 0.010338929877325426, "clip_ratio/region_mean": 0.03320883221458644, "completions/clipped_ratio": 0.9448529411764706, "completions/max_length": 128.0, "completions/max_terminated_length": 106.6470588235294, "completions/mean_length": 125.87040441176471, "completions/mean_terminated_length": 87.59656883688534, "completions/min_length": 75.76470588235294, "completions/min_terminated_length": 68.23529411764706, "epoch": 0.5564830272676683, "frac_reward_zero_std": 0.0, "grad_norm": 7.271297931671143, "kl": 1.135116419494152, "learning_rate": 2.2163669474163444e-06, "loss": 0.3229, "num_tokens": 8125183.0, "reward": 2.681288887472714, "reward_std": 5.512399000280044, "rewards/RewardModelWrapper/mean": 2.681288887472714, "rewards/RewardModelWrapper/std": 6.263462291044347, "step": 2500 }, { "clip_ratio/high_max": 0.024373745566699655, "clip_ratio/high_mean": 0.024373745566699655, "clip_ratio/low_mean": 0.007875631948991213, "clip_ratio/low_min": 0.007875631948991213, "clip_ratio/region_mean": 0.032249377460684625, "completions/clipped_ratio": 0.962890625, "completions/max_length": 128.0, "completions/max_terminated_length": 99.3125, "completions/mean_length": 126.6416015625, "completions/mean_terminated_length": 84.72916746139526, "completions/min_length": 75.75, "completions/min_terminated_length": 67.75, "epoch": 0.5676126878130217, "frac_reward_zero_std": 0.0, "grad_norm": 4.9515485763549805, "kl": 1.1067718014121055, "learning_rate": 2.199294331891646e-06, "loss": 0.3174, "num_tokens": 8282648.0, "reward": 2.5410157814621925, "reward_std": 5.60416579246521, "rewards/RewardModelWrapper/mean": 2.5410157814621925, "rewards/RewardModelWrapper/std": 6.249917358160019, "step": 2550 }, { "clip_ratio/high_max": 0.021070915756281464, "clip_ratio/high_mean": 0.021070915756281464, "clip_ratio/low_mean": 0.010609990251832641, "clip_ratio/low_min": 0.010609990251832641, "clip_ratio/region_mean": 0.03168090590508655, "completions/clipped_ratio": 0.9613970588235294, "completions/max_length": 128.0, "completions/max_terminated_length": 89.23529411764706, "completions/mean_length": 126.42463235294117, "completions/mean_terminated_length": 78.3034320158117, "completions/min_length": 81.52941176470588, "completions/min_terminated_length": 66.47058823529412, "epoch": 0.5787423483583751, "frac_reward_zero_std": 0.0, "grad_norm": 7.162291049957275, "kl": 1.2047522097826004, "learning_rate": 2.1822217163669474e-06, "loss": 0.3462, "num_tokens": 8449478.0, "reward": 3.0924135095932903, "reward_std": 5.470459377064424, "rewards/RewardModelWrapper/mean": 3.0924135095932903, "rewards/RewardModelWrapper/std": 6.024646282196045, "step": 2600 }, { "clip_ratio/high_max": 0.02261253957170993, "clip_ratio/high_mean": 0.02261253957170993, "clip_ratio/low_mean": 0.008833104789373466, "clip_ratio/low_min": 0.008833104789373466, "clip_ratio/region_mean": 0.0314456443907693, "completions/clipped_ratio": 0.9494485294117647, "completions/max_length": 128.0, "completions/max_terminated_length": 94.29411764705883, "completions/mean_length": 125.73161764705883, "completions/mean_terminated_length": 72.89117723352769, "completions/min_length": 66.52941176470588, "completions/min_terminated_length": 51.470588235294116, "epoch": 0.5898720089037285, "frac_reward_zero_std": 0.0, "grad_norm": 6.318300247192383, "kl": 1.2395999401807785, "learning_rate": 2.165149100842249e-06, "loss": 0.3561, "num_tokens": 8615194.0, "reward": 2.5635701067307415, "reward_std": 5.7780221490299, "rewards/RewardModelWrapper/mean": 2.5635701067307415, "rewards/RewardModelWrapper/std": 6.476823947008918, "step": 2650 }, { "clip_ratio/high_max": 0.02310706490650773, "clip_ratio/high_mean": 0.02310706490650773, "clip_ratio/low_mean": 0.008465991305129136, "clip_ratio/low_min": 0.008465991305129136, "clip_ratio/region_mean": 0.03157305620610714, "completions/clipped_ratio": 0.9462890625, "completions/max_length": 128.0, "completions/max_terminated_length": 107.625, "completions/mean_length": 125.8720703125, "completions/mean_terminated_length": 88.43675756454468, "completions/min_length": 65.875, "completions/min_terminated_length": 65.875, "epoch": 0.6010016694490818, "frac_reward_zero_std": 0.0, "grad_norm": 3.8498454093933105, "kl": 1.2743730303645133, "learning_rate": 2.148076485317551e-06, "loss": 0.3642, "num_tokens": 8771759.0, "reward": 3.0489635169506073, "reward_std": 5.676127910614014, "rewards/RewardModelWrapper/mean": 3.0489635169506073, "rewards/RewardModelWrapper/std": 6.18413832783699, "step": 2700 }, { "clip_ratio/high_max": 0.017379222289891912, "clip_ratio/high_mean": 0.017379222289891912, "clip_ratio/low_mean": 0.012123786294832826, "clip_ratio/low_min": 0.012123786294832826, "clip_ratio/region_mean": 0.029503008612664416, "completions/clipped_ratio": 0.9613970588235294, "completions/max_length": 128.0, "completions/max_terminated_length": 87.82352941176471, "completions/mean_length": 126.33823529411765, "completions/mean_terminated_length": 70.85490282844094, "completions/min_length": 75.94117647058823, "completions/min_terminated_length": 53.35294117647059, "epoch": 0.6121313299944352, "frac_reward_zero_std": 0.0, "grad_norm": 5.328779697418213, "kl": 1.5358505266904832, "learning_rate": 2.1320282267243345e-06, "loss": 0.44, "num_tokens": 8938623.0, "reward": 4.035378414041856, "reward_std": 5.26687082122354, "rewards/RewardModelWrapper/mean": 4.035378414041856, "rewards/RewardModelWrapper/std": 6.010923722211053, "step": 2750 }, { "clip_ratio/high_max": 0.02342768482863903, "clip_ratio/high_mean": 0.02342768482863903, "clip_ratio/low_mean": 0.007425281075702514, "clip_ratio/low_min": 0.007425281075702514, "clip_ratio/region_mean": 0.03085296612116508, "completions/clipped_ratio": 0.9476102941176471, "completions/max_length": 128.0, "completions/max_terminated_length": 102.82352941176471, "completions/mean_length": 125.83272058823529, "completions/mean_terminated_length": 83.69166744456572, "completions/min_length": 70.94117647058823, "completions/min_terminated_length": 63.411764705882355, "epoch": 0.6232609905397886, "frac_reward_zero_std": 0.0, "grad_norm": 13.94502067565918, "kl": 1.3090015414357186, "learning_rate": 2.114955611199636e-06, "loss": 0.3749, "num_tokens": 9104641.0, "reward": 3.50168057049022, "reward_std": 5.636927548576804, "rewards/RewardModelWrapper/mean": 3.50168057049022, "rewards/RewardModelWrapper/std": 6.223201779758229, "step": 2800 }, { "clip_ratio/high_max": 0.023096702507464217, "clip_ratio/high_mean": 0.023096702507464217, "clip_ratio/low_mean": 0.01079344226163812, "clip_ratio/low_min": 0.01079344226163812, "clip_ratio/region_mean": 0.033890144524630156, "completions/clipped_ratio": 0.947265625, "completions/max_length": 128.0, "completions/max_terminated_length": 111.125, "completions/mean_length": 126.0732421875, "completions/mean_terminated_length": 91.28541803359985, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.6343906510851419, "frac_reward_zero_std": 0.0, "grad_norm": 5.740921974182129, "kl": 1.2168986845016478, "learning_rate": 2.0978829956749376e-06, "loss": 0.3468, "num_tokens": 9260988.0, "reward": 2.903833270072937, "reward_std": 5.634722024202347, "rewards/RewardModelWrapper/mean": 2.903833270072937, "rewards/RewardModelWrapper/std": 6.182769417762756, "step": 2850 }, { "clip_ratio/high_max": 0.021131394968833775, "clip_ratio/high_mean": 0.021131394968833775, "clip_ratio/low_mean": 0.00905259191960795, "clip_ratio/low_min": 0.00905259191960795, "clip_ratio/region_mean": 0.03018398679094389, "completions/clipped_ratio": 0.9549632352941176, "completions/max_length": 128.0, "completions/max_terminated_length": 96.94117647058823, "completions/mean_length": 125.89613970588235, "completions/mean_terminated_length": 78.88333488913143, "completions/min_length": 67.82352941176471, "completions/min_terminated_length": 60.294117647058826, "epoch": 0.6455203116304953, "frac_reward_zero_std": 0.0, "grad_norm": 4.610988616943359, "kl": 1.332914224267006, "learning_rate": 2.080810380150239e-06, "loss": 0.3851, "num_tokens": 9427003.0, "reward": 3.4099216741674088, "reward_std": 5.599381278542912, "rewards/RewardModelWrapper/mean": 3.4099216741674088, "rewards/RewardModelWrapper/std": 6.283486815059886, "step": 2900 }, { "clip_ratio/high_max": 0.024977084384299814, "clip_ratio/high_mean": 0.024977084384299814, "clip_ratio/low_mean": 0.009850850635266396, "clip_ratio/low_min": 0.009850850635266396, "clip_ratio/region_mean": 0.034827935132198035, "completions/clipped_ratio": 0.9430147058823529, "completions/max_length": 128.0, "completions/max_terminated_length": 105.11764705882354, "completions/mean_length": 125.54503676470588, "completions/mean_terminated_length": 85.04131810805377, "completions/min_length": 64.41176470588235, "completions/min_terminated_length": 64.41176470588235, "epoch": 0.6566499721758486, "frac_reward_zero_std": 0.0, "grad_norm": 6.673405170440674, "kl": 1.34111887216568, "learning_rate": 2.0637377646255406e-06, "loss": 0.3787, "num_tokens": 9592884.0, "reward": 3.767064431134392, "reward_std": 5.628603626700008, "rewards/RewardModelWrapper/mean": 3.767064431134392, "rewards/RewardModelWrapper/std": 6.238466964048498, "step": 2950 }, { "clip_ratio/high_max": 0.019235485673416406, "clip_ratio/high_mean": 0.019235485673416406, "clip_ratio/low_mean": 0.008951259328168816, "clip_ratio/low_min": 0.008951259328168816, "clip_ratio/region_mean": 0.02818674497772008, "completions/clipped_ratio": 0.958984375, "completions/max_length": 128.0, "completions/max_terminated_length": 93.625, "completions/mean_length": 126.48046875, "completions/mean_terminated_length": 79.1166672706604, "completions/min_length": 76.125, "completions/min_terminated_length": 60.125, "epoch": 0.667779632721202, "frac_reward_zero_std": 0.0, "grad_norm": Infinity, "kl": 1.6551162710785865, "learning_rate": 2.0470066014113363e-06, "loss": 0.4809, "num_tokens": 9750288.0, "reward": 3.3632944226264954, "reward_std": 5.644728451967239, "rewards/RewardModelWrapper/mean": 3.3632944226264954, "rewards/RewardModelWrapper/std": 6.475361466407776, "step": 3000 }, { "clip_ratio/high_max": 0.021347561194561424, "clip_ratio/high_mean": 0.021347561194561424, "clip_ratio/low_mean": 0.012039180095889605, "clip_ratio/low_min": 0.012039180095889605, "clip_ratio/region_mean": 0.03338674116646871, "completions/clipped_ratio": 0.9641544117647058, "completions/max_length": 128.0, "completions/max_terminated_length": 84.29411764705883, "completions/mean_length": 126.47426470588235, "completions/mean_terminated_length": 67.78921643425437, "completions/min_length": 73.29411764705883, "completions/min_terminated_length": 50.705882352941174, "epoch": 0.6789092932665554, "frac_reward_zero_std": 0.0, "grad_norm": 4.413740158081055, "kl": 1.3862884595990181, "learning_rate": 2.030275438197132e-06, "loss": 0.4017, "num_tokens": 9917180.0, "reward": 3.722391970017377, "reward_std": 5.822299059699564, "rewards/RewardModelWrapper/mean": 3.722391970017377, "rewards/RewardModelWrapper/std": 6.463091822231517, "step": 3050 }, { "clip_ratio/high_max": 0.018999405660433694, "clip_ratio/high_mean": 0.018999405660433694, "clip_ratio/low_mean": 0.010441597908793484, "clip_ratio/low_min": 0.010441597908793484, "clip_ratio/region_mean": 0.029441003524698316, "completions/clipped_ratio": 0.9586397058823529, "completions/max_length": 128.0, "completions/max_terminated_length": 112.29411764705883, "completions/mean_length": 126.4623161764706, "completions/mean_terminated_length": 90.0686279745663, "completions/min_length": 63.94117647058823, "completions/min_terminated_length": 63.94117647058823, "epoch": 0.6900389538119087, "frac_reward_zero_std": 0.0, "grad_norm": 11.30611801147461, "kl": 1.3920823442935943, "learning_rate": 2.0132028226724335e-06, "loss": 0.4035, "num_tokens": 10083867.0, "reward": 3.71955924875596, "reward_std": 5.790389762205236, "rewards/RewardModelWrapper/mean": 3.71955924875596, "rewards/RewardModelWrapper/std": 6.5407993653241325, "step": 3100 }, { "clip_ratio/high_max": 0.02239516925183125, "clip_ratio/high_mean": 0.02239516925183125, "clip_ratio/low_mean": 0.010940310020523612, "clip_ratio/low_min": 0.010940310020523612, "clip_ratio/region_mean": 0.03333547928952612, "completions/clipped_ratio": 0.955078125, "completions/max_length": 128.0, "completions/max_terminated_length": 99.875, "completions/mean_length": 126.2353515625, "completions/mean_terminated_length": 82.07812547683716, "completions/min_length": 70.75, "completions/min_terminated_length": 62.75, "epoch": 0.7011686143572621, "frac_reward_zero_std": 0.0, "grad_norm": 8.098145484924316, "kl": 1.332285776436329, "learning_rate": 1.996130207147735e-06, "loss": 0.3818, "num_tokens": 10240948.0, "reward": 3.675293631851673, "reward_std": 5.620851904153824, "rewards/RewardModelWrapper/mean": 3.675293631851673, "rewards/RewardModelWrapper/std": 6.339143455028534, "step": 3150 }, { "clip_ratio/high_max": 0.017545219952007755, "clip_ratio/high_mean": 0.017545219952007755, "clip_ratio/low_mean": 0.006160206313361414, "clip_ratio/low_min": 0.006160206313361414, "clip_ratio/region_mean": 0.023705426228698343, "completions/clipped_ratio": 0.9540441176470589, "completions/max_length": 128.0, "completions/max_terminated_length": 105.05882352941177, "completions/mean_length": 126.03216911764706, "completions/mean_terminated_length": 81.38235316557042, "completions/min_length": 66.52941176470588, "completions/min_terminated_length": 59.0, "epoch": 0.7122982749026154, "frac_reward_zero_std": 0.0, "grad_norm": 4.446596622467041, "kl": 1.3602485132217408, "learning_rate": 1.9790575916230366e-06, "loss": 0.3915, "num_tokens": 10407047.0, "reward": 3.461222396177404, "reward_std": 5.5388546831467576, "rewards/RewardModelWrapper/mean": 3.461222396177404, "rewards/RewardModelWrapper/std": 6.420014409457936, "step": 3200 }, { "clip_ratio/high_max": 0.01795817382866517, "clip_ratio/high_mean": 0.01795817382866517, "clip_ratio/low_mean": 0.008432389081281143, "clip_ratio/low_min": 0.008432389081281143, "clip_ratio/region_mean": 0.026390562802553176, "completions/clipped_ratio": 0.9669117647058824, "completions/max_length": 128.0, "completions/max_terminated_length": 78.94117647058823, "completions/mean_length": 126.28033088235294, "completions/mean_terminated_length": 61.84313740449793, "completions/min_length": 68.05882352941177, "completions/min_terminated_length": 45.470588235294116, "epoch": 0.7234279354479688, "frac_reward_zero_std": 0.0, "grad_norm": 17.92909812927246, "kl": 1.4322891801595687, "learning_rate": 1.9619849760983386e-06, "loss": 0.4131, "num_tokens": 10573736.0, "reward": 3.6829915467430565, "reward_std": 5.790671881507425, "rewards/RewardModelWrapper/mean": 3.6829915467430565, "rewards/RewardModelWrapper/std": 6.5448582032147575, "step": 3250 }, { "clip_ratio/high_max": 0.01961003711214289, "clip_ratio/high_mean": 0.01961003711214289, "clip_ratio/low_mean": 0.010123618032957893, "clip_ratio/low_min": 0.010123618032957893, "clip_ratio/region_mean": 0.02973365513375029, "completions/clipped_ratio": 0.9736328125, "completions/max_length": 128.0, "completions/max_terminated_length": 89.75, "completions/mean_length": 127.12890625, "completions/mean_terminated_length": 78.04687547683716, "completions/min_length": 88.3125, "completions/min_terminated_length": 64.3125, "epoch": 0.7345575959933222, "frac_reward_zero_std": 0.0, "grad_norm": 3.4893033504486084, "kl": 1.3992696887254714, "learning_rate": 1.94491236057364e-06, "loss": 0.407, "num_tokens": 10732252.0, "reward": 4.159975051879883, "reward_std": 5.596900701522827, "rewards/RewardModelWrapper/mean": 4.159975051879883, "rewards/RewardModelWrapper/std": 6.406121611595154, "step": 3300 }, { "clip_ratio/high_max": 0.01751380935544148, "clip_ratio/high_mean": 0.01751380935544148, "clip_ratio/low_mean": 0.006701366908382625, "clip_ratio/low_min": 0.006701366908382625, "clip_ratio/region_mean": 0.02421517624054104, "completions/clipped_ratio": 0.9632352941176471, "completions/max_length": 128.0, "completions/max_terminated_length": 89.11764705882354, "completions/mean_length": 126.52113970588235, "completions/mean_terminated_length": 75.78823538387523, "completions/min_length": 73.58823529411765, "completions/min_terminated_length": 58.529411764705884, "epoch": 0.7456872565386756, "frac_reward_zero_std": 0.0, "grad_norm": 7.024867057800293, "kl": 1.48705244243145, "learning_rate": 1.9278397450489416e-06, "loss": 0.4302, "num_tokens": 10898899.0, "reward": 4.022455299601836, "reward_std": 6.009893417358398, "rewards/RewardModelWrapper/mean": 4.022455299601836, "rewards/RewardModelWrapper/std": 6.55277754278744, "step": 3350 }, { "clip_ratio/high_max": 0.020205343069974332, "clip_ratio/high_mean": 0.020205343069974332, "clip_ratio/low_mean": 0.008244332130707334, "clip_ratio/low_min": 0.008244332130707334, "clip_ratio/region_mean": 0.028449675207957624, "completions/clipped_ratio": 0.953125, "completions/max_length": 128.0, "completions/max_terminated_length": 83.52941176470588, "completions/mean_length": 125.67463235294117, "completions/mean_terminated_length": 65.16414619894589, "completions/min_length": 67.76470588235294, "completions/min_terminated_length": 45.1764705882353, "epoch": 0.756816917084029, "frac_reward_zero_std": 0.0, "grad_norm": 6.548982620239258, "kl": 1.4358280056715012, "learning_rate": 1.910767129524243e-06, "loss": 0.4127, "num_tokens": 11065097.0, "reward": 3.6614036700304817, "reward_std": 5.941182669471292, "rewards/RewardModelWrapper/mean": 3.6614036700304817, "rewards/RewardModelWrapper/std": 6.68101375243243, "step": 3400 }, { "clip_ratio/high_max": 0.018692465843632818, "clip_ratio/high_mean": 0.018692465843632818, "clip_ratio/low_mean": 0.008573709986812901, "clip_ratio/low_min": 0.008573709986812901, "clip_ratio/region_mean": 0.02726617576321587, "completions/clipped_ratio": 0.9541015625, "completions/max_length": 128.0, "completions/max_terminated_length": 98.8125, "completions/mean_length": 126.1083984375, "completions/mean_terminated_length": 77.7172622680664, "completions/min_length": 63.125, "completions/min_terminated_length": 55.125, "epoch": 0.7679465776293823, "frac_reward_zero_std": 0.0, "grad_norm": 3.6161601543426514, "kl": 1.4051894819736481, "learning_rate": 1.8936945139995447e-06, "loss": 0.4055, "num_tokens": 11221336.0, "reward": 2.737824946641922, "reward_std": 6.139679282903671, "rewards/RewardModelWrapper/mean": 2.737824946641922, "rewards/RewardModelWrapper/std": 6.881059348583221, "step": 3450 }, { "clip_ratio/high_max": 0.019909201117698103, "clip_ratio/high_mean": 0.019909201117698103, "clip_ratio/low_mean": 0.009944785697734914, "clip_ratio/low_min": 0.009944785697734914, "clip_ratio/region_mean": 0.029853986804373563, "completions/clipped_ratio": 0.9733455882352942, "completions/max_length": 128.0, "completions/max_terminated_length": 91.88235294117646, "completions/mean_length": 127.20036764705883, "completions/mean_terminated_length": 86.0049025591682, "completions/min_length": 94.52941176470588, "completions/min_terminated_length": 79.47058823529412, "epoch": 0.7790762381747357, "frac_reward_zero_std": 0.0, "grad_norm": 7.78251314163208, "kl": 1.4502297604084016, "learning_rate": 1.8766218984748462e-06, "loss": 0.4266, "num_tokens": 11389018.0, "reward": 4.499273047727697, "reward_std": 5.489500326268813, "rewards/RewardModelWrapper/mean": 4.499273047727697, "rewards/RewardModelWrapper/std": 6.2598629839280076, "step": 3500 }, { "clip_ratio/high_max": 0.01706919132906478, "clip_ratio/high_mean": 0.01706919132906478, "clip_ratio/low_mean": 0.007432717043848243, "clip_ratio/low_min": 0.007432717043848243, "clip_ratio/region_mean": 0.024501908438978717, "completions/clipped_ratio": 0.9568014705882353, "completions/max_length": 128.0, "completions/max_terminated_length": 91.47058823529412, "completions/mean_length": 126.015625, "completions/mean_terminated_length": 74.68823646096622, "completions/min_length": 61.76470588235294, "completions/min_terminated_length": 54.23529411764706, "epoch": 0.7902058987200891, "frac_reward_zero_std": 0.0, "grad_norm": 7.431344509124756, "kl": 1.4172208327054978, "learning_rate": 1.859549282950148e-06, "loss": 0.4053, "num_tokens": 11555371.0, "reward": 3.9203204547657684, "reward_std": 5.879987856921027, "rewards/RewardModelWrapper/mean": 3.9203204547657684, "rewards/RewardModelWrapper/std": 6.654794917387121, "step": 3550 }, { "clip_ratio/high_max": 0.017978638106724246, "clip_ratio/high_mean": 0.017978638106724246, "clip_ratio/low_mean": 0.008542120530910325, "clip_ratio/low_min": 0.008542120530910325, "clip_ratio/region_mean": 0.02652075860532932, "completions/clipped_ratio": 0.95703125, "completions/max_length": 128.0, "completions/max_terminated_length": 105.0625, "completions/mean_length": 126.1298828125, "completions/mean_terminated_length": 88.37500047683716, "completions/min_length": 68.9375, "completions/min_terminated_length": 68.9375, "epoch": 0.8013355592654424, "frac_reward_zero_std": 0.0, "grad_norm": 5.334597110748291, "kl": 1.357377045750618, "learning_rate": 1.8424766674254495e-06, "loss": 0.39, "num_tokens": 11712544.0, "reward": 3.334804505109787, "reward_std": 6.004520118236542, "rewards/RewardModelWrapper/mean": 3.334804505109787, "rewards/RewardModelWrapper/std": 6.608620345592499, "step": 3600 }, { "clip_ratio/high_max": 0.01815531796310097, "clip_ratio/high_mean": 0.01815531796310097, "clip_ratio/low_mean": 0.00551853927434422, "clip_ratio/low_min": 0.00551853927434422, "clip_ratio/region_mean": 0.023673857206013053, "completions/clipped_ratio": 0.9586397058823529, "completions/max_length": 128.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 126.57444852941177, "completions/mean_terminated_length": 91.43627570657169, "completions/min_length": 73.05882352941177, "completions/min_terminated_length": 73.05882352941177, "epoch": 0.8124652198107958, "frac_reward_zero_std": 0.0, "grad_norm": 4.32098913192749, "kl": 1.3787575218081474, "learning_rate": 1.825404051900751e-06, "loss": 0.406, "num_tokens": 11879329.0, "reward": 4.733595371246338, "reward_std": 5.286141087027157, "rewards/RewardModelWrapper/mean": 4.733595371246338, "rewards/RewardModelWrapper/std": 6.089175813338336, "step": 3650 }, { "clip_ratio/high_max": 0.018292159989941867, "clip_ratio/high_mean": 0.018292159989941867, "clip_ratio/low_mean": 0.00964461057272274, "clip_ratio/low_min": 0.00964461057272274, "clip_ratio/region_mean": 0.027936770617961883, "completions/clipped_ratio": 0.9347426470588235, "completions/max_length": 128.0, "completions/max_terminated_length": 100.94117647058823, "completions/mean_length": 125.18014705882354, "completions/mean_terminated_length": 79.63531673655791, "completions/min_length": 62.8235294117647, "completions/min_terminated_length": 55.294117647058826, "epoch": 0.8235948803561491, "frac_reward_zero_std": 0.0, "grad_norm": 11.81167984008789, "kl": 1.3568527114391327, "learning_rate": 1.8083314363760528e-06, "loss": 0.3856, "num_tokens": 12044285.0, "reward": 3.853144645690918, "reward_std": 5.8185105744530174, "rewards/RewardModelWrapper/mean": 3.853144645690918, "rewards/RewardModelWrapper/std": 6.648196416742661, "step": 3700 }, { "clip_ratio/high_max": 0.020421573969069868, "clip_ratio/high_mean": 0.020421573969069868, "clip_ratio/low_mean": 0.006358395353017841, "clip_ratio/low_min": 0.006358395353017841, "clip_ratio/region_mean": 0.02677996931830421, "completions/clipped_ratio": 0.966796875, "completions/max_length": 128.0, "completions/max_terminated_length": 85.4375, "completions/mean_length": 126.6142578125, "completions/mean_terminated_length": 75.74791765213013, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.8347245409015025, "frac_reward_zero_std": 0.0, "grad_norm": 3.919222593307495, "kl": 1.4361850446462632, "learning_rate": 1.7912588208513545e-06, "loss": 0.4195, "num_tokens": 12201530.0, "reward": 4.583240419626236, "reward_std": 5.661596119403839, "rewards/RewardModelWrapper/mean": 4.583240419626236, "rewards/RewardModelWrapper/std": 6.355997741222382, "step": 3750 }, { "clip_ratio/high_max": 0.01899803020292893, "clip_ratio/high_mean": 0.01899803020292893, "clip_ratio/low_mean": 0.005853212493821047, "clip_ratio/low_min": 0.005853212493821047, "clip_ratio/region_mean": 0.02485124268569052, "completions/clipped_ratio": 0.9485294117647058, "completions/max_length": 128.0, "completions/max_terminated_length": 106.41176470588235, "completions/mean_length": 125.8529411764706, "completions/mean_terminated_length": 86.47465066348805, "completions/min_length": 63.11764705882353, "completions/min_terminated_length": 63.11764705882353, "epoch": 0.8458542014468559, "frac_reward_zero_std": 0.0, "grad_norm": 6.776216983795166, "kl": 1.4694108253717422, "learning_rate": 1.7741862053266563e-06, "loss": 0.4258, "num_tokens": 12367226.0, "reward": 4.691084188573501, "reward_std": 5.392301559448242, "rewards/RewardModelWrapper/mean": 4.691084188573501, "rewards/RewardModelWrapper/std": 6.076854313121123, "step": 3800 }, { "clip_ratio/high_max": 0.020900118886493145, "clip_ratio/high_mean": 0.020900118886493145, "clip_ratio/low_mean": 0.008081750934943557, "clip_ratio/low_min": 0.008081750934943557, "clip_ratio/region_mean": 0.028981869909912347, "completions/clipped_ratio": 0.9733455882352942, "completions/max_length": 128.0, "completions/max_terminated_length": 76.76470588235294, "completions/mean_length": 126.63051470588235, "completions/mean_terminated_length": 62.98235298605526, "completions/min_length": 81.41176470588235, "completions/min_terminated_length": 51.294117647058826, "epoch": 0.8569838619922092, "frac_reward_zero_std": 0.0, "grad_norm": 6.310102462768555, "kl": 1.3774869224429132, "learning_rate": 1.7571135898019578e-06, "loss": 0.398, "num_tokens": 12534040.0, "reward": 3.871620360542746, "reward_std": 5.696767147849588, "rewards/RewardModelWrapper/mean": 3.871620360542746, "rewards/RewardModelWrapper/std": 6.582426996792064, "step": 3850 }, { "clip_ratio/high_max": 0.021299479028675704, "clip_ratio/high_mean": 0.021299479028675704, "clip_ratio/low_mean": 0.0075305427008424885, "clip_ratio/low_min": 0.0075305427008424885, "clip_ratio/region_mean": 0.028830021731555463, "completions/clipped_ratio": 0.9619140625, "completions/max_length": 128.0, "completions/max_terminated_length": 85.8125, "completions/mean_length": 126.1943359375, "completions/mean_terminated_length": 71.39270901679993, "completions/min_length": 71.8125, "completions/min_terminated_length": 55.8125, "epoch": 0.8681135225375626, "frac_reward_zero_std": 0.0, "grad_norm": 4.930108547210693, "kl": 1.3843135032057763, "learning_rate": 1.7400409742772593e-06, "loss": 0.3964, "num_tokens": 12690615.0, "reward": 3.086591437458992, "reward_std": 6.208359390497208, "rewards/RewardModelWrapper/mean": 3.086591437458992, "rewards/RewardModelWrapper/std": 6.8491051197052, "step": 3900 }, { "clip_ratio/high_max": 0.018020967768970875, "clip_ratio/high_mean": 0.018020967768970875, "clip_ratio/low_mean": 0.006037966601434163, "clip_ratio/low_min": 0.006037966601434163, "clip_ratio/region_mean": 0.024058934384956956, "completions/clipped_ratio": 0.953125, "completions/max_length": 128.0, "completions/max_terminated_length": 103.76470588235294, "completions/mean_length": 126.11305147058823, "completions/mean_terminated_length": 84.38186331356273, "completions/min_length": 66.11764705882354, "completions/min_terminated_length": 58.588235294117645, "epoch": 0.8792431830829159, "frac_reward_zero_std": 0.0, "grad_norm": 4.74282169342041, "kl": 1.432164865732193, "learning_rate": 1.7229683587525609e-06, "loss": 0.4134, "num_tokens": 12857386.0, "reward": 3.5356551899629483, "reward_std": 5.877786804648006, "rewards/RewardModelWrapper/mean": 3.5356551899629483, "rewards/RewardModelWrapper/std": 6.742880484637092, "step": 3950 }, { "clip_ratio/high_max": 0.016392124033300207, "clip_ratio/high_mean": 0.016392124033300207, "clip_ratio/low_mean": 0.00735437709663529, "clip_ratio/low_min": 0.00735437709663529, "clip_ratio/region_mean": 0.02374650107929483, "completions/clipped_ratio": 0.9669117647058824, "completions/max_length": 128.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 126.86764705882354, "completions/mean_terminated_length": 82.9313735961914, "completions/min_length": 84.29411764705883, "completions/min_terminated_length": 69.23529411764706, "epoch": 0.8903728436282693, "frac_reward_zero_std": 0.0, "grad_norm": 5.2612786293029785, "kl": 1.4968037492036819, "learning_rate": 1.7058957432278626e-06, "loss": 0.4371, "num_tokens": 13025050.0, "reward": 3.9833039676441864, "reward_std": 5.820403575897217, "rewards/RewardModelWrapper/mean": 3.9833039676441864, "rewards/RewardModelWrapper/std": 6.59747979220222, "step": 4000 }, { "clip_ratio/high_max": 0.013542763022705913, "clip_ratio/high_mean": 0.013542763022705913, "clip_ratio/low_mean": 0.007844352710526437, "clip_ratio/low_min": 0.007844352710526437, "clip_ratio/region_mean": 0.021387115789111705, "completions/clipped_ratio": 0.9755859375, "completions/max_length": 128.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 127.052734375, "completions/mean_terminated_length": 69.609375, "completions/min_length": 89.8125, "completions/min_terminated_length": 57.8125, "epoch": 0.9015025041736227, "frac_reward_zero_std": 0.0, "grad_norm": 9.22656536102295, "kl": 1.486895147562027, "learning_rate": 1.6888231277031642e-06, "loss": 0.4339, "num_tokens": 13182896.0, "reward": 3.8604883551597595, "reward_std": 5.920006081461906, "rewards/RewardModelWrapper/mean": 3.8604883551597595, "rewards/RewardModelWrapper/std": 6.682152062654495, "step": 4050 }, { "clip_ratio/high_max": 0.017551230599638076, "clip_ratio/high_mean": 0.017551230599638076, "clip_ratio/low_mean": 0.006257881603378337, "clip_ratio/low_min": 0.006257881603378337, "clip_ratio/region_mean": 0.023809112217277287, "completions/clipped_ratio": 0.9604779411764706, "completions/max_length": 128.0, "completions/max_terminated_length": 103.47058823529412, "completions/mean_length": 126.76011029411765, "completions/mean_terminated_length": 90.72815165800206, "completions/min_length": 81.47058823529412, "completions/min_terminated_length": 73.94117647058823, "epoch": 0.9126321647189761, "frac_reward_zero_std": 0.0, "grad_norm": 4.342564105987549, "kl": 1.479159579873085, "learning_rate": 1.6717505121784657e-06, "loss": 0.434, "num_tokens": 13349539.0, "reward": 3.891232869204353, "reward_std": 5.906516776365392, "rewards/RewardModelWrapper/mean": 3.891232869204353, "rewards/RewardModelWrapper/std": 6.87522164513083, "step": 4100 }, { "clip_ratio/high_max": 0.017731820455519482, "clip_ratio/high_mean": 0.017731820455519482, "clip_ratio/low_mean": 0.0037902081329957583, "clip_ratio/low_min": 0.0037902081329957583, "clip_ratio/region_mean": 0.021522028532344847, "completions/clipped_ratio": 0.9632352941176471, "completions/max_length": 128.0, "completions/max_terminated_length": 102.11764705882354, "completions/mean_length": 126.3373161764706, "completions/mean_terminated_length": 80.36274584601907, "completions/min_length": 61.588235294117645, "completions/min_terminated_length": 54.05882352941177, "epoch": 0.9237618252643295, "frac_reward_zero_std": 0.0, "grad_norm": 8.095427513122559, "kl": 1.5771301573514938, "learning_rate": 1.6546778966537674e-06, "loss": 0.4627, "num_tokens": 13516058.0, "reward": 4.4532030890969665, "reward_std": 5.776824221891515, "rewards/RewardModelWrapper/mean": 4.4532030890969665, "rewards/RewardModelWrapper/std": 6.367258969475241, "step": 4150 }, { "clip_ratio/high_max": 0.018110398813150824, "clip_ratio/high_mean": 0.018110398813150824, "clip_ratio/low_mean": 0.006745649516233243, "clip_ratio/low_min": 0.006745649516233243, "clip_ratio/region_mean": 0.024856048391666264, "completions/clipped_ratio": 0.9609375, "completions/max_length": 128.0, "completions/max_terminated_length": 93.9375, "completions/mean_length": 126.3310546875, "completions/mean_terminated_length": 81.33363127708435, "completions/min_length": 74.75, "completions/min_terminated_length": 66.75, "epoch": 0.9348914858096828, "frac_reward_zero_std": 0.0, "grad_norm": 3.7098264694213867, "kl": 1.4337137299776077, "learning_rate": 1.637605281129069e-06, "loss": 0.4139, "num_tokens": 13673549.0, "reward": 3.717156395316124, "reward_std": 5.887754291296005, "rewards/RewardModelWrapper/mean": 3.717156395316124, "rewards/RewardModelWrapper/std": 6.542896807193756, "step": 4200 }, { "clip_ratio/high_max": 0.01609679988003336, "clip_ratio/high_mean": 0.01609679988003336, "clip_ratio/low_mean": 0.006251108425203711, "clip_ratio/low_min": 0.006251108425203711, "clip_ratio/region_mean": 0.022347908235387876, "completions/clipped_ratio": 0.9466911764705882, "completions/max_length": 128.0, "completions/max_terminated_length": 108.3529411764706, "completions/mean_length": 126.234375, "completions/mean_terminated_length": 89.88039308435776, "completions/min_length": 68.6470588235294, "completions/min_terminated_length": 61.11764705882353, "epoch": 0.9460211463550362, "frac_reward_zero_std": 0.0, "grad_norm": 4.461524963378906, "kl": 1.4435530692338943, "learning_rate": 1.6205326656043705e-06, "loss": 0.4174, "num_tokens": 13839820.0, "reward": 3.5667920813840976, "reward_std": 5.679576621336095, "rewards/RewardModelWrapper/mean": 3.5667920813840976, "rewards/RewardModelWrapper/std": 6.743907311383416, "step": 4250 }, { "clip_ratio/high_max": 0.01591621272964403, "clip_ratio/high_mean": 0.01591621272964403, "clip_ratio/low_mean": 0.005297647488187067, "clip_ratio/low_min": 0.005297647488187067, "clip_ratio/region_mean": 0.021213860225398094, "completions/clipped_ratio": 0.9669117647058824, "completions/max_length": 128.0, "completions/max_terminated_length": 100.76470588235294, "completions/mean_length": 126.63602941176471, "completions/mean_terminated_length": 87.36666780359604, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.9571508069003896, "frac_reward_zero_std": 0.0, "grad_norm": 8.245085716247559, "kl": 1.4910035210847854, "learning_rate": 1.603460050079672e-06, "loss": 0.4312, "num_tokens": 14007144.0, "reward": 4.034057981827679, "reward_std": 5.743304505067713, "rewards/RewardModelWrapper/mean": 4.034057981827679, "rewards/RewardModelWrapper/std": 6.6319817094241875, "step": 4300 }, { "clip_ratio/high_max": 0.0162072420923505, "clip_ratio/high_mean": 0.0162072420923505, "clip_ratio/low_mean": 0.00646918074140558, "clip_ratio/low_min": 0.00646918074140558, "clip_ratio/region_mean": 0.022676422880031168, "completions/clipped_ratio": 0.9560546875, "completions/max_length": 128.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 126.1572265625, "completions/mean_terminated_length": 80.61093807220459, "completions/min_length": 69.625, "completions/min_terminated_length": 61.625, "epoch": 0.9682804674457429, "frac_reward_zero_std": 0.0, "grad_norm": 3.9271068572998047, "kl": 1.5564635121822357, "learning_rate": 1.5863874345549738e-06, "loss": 0.4481, "num_tokens": 14163497.0, "reward": 4.418118596076965, "reward_std": 5.663649529218674, "rewards/RewardModelWrapper/mean": 4.418118596076965, "rewards/RewardModelWrapper/std": 6.5488221347332, "step": 4350 }, { "clip_ratio/high_max": 0.015229720452334733, "clip_ratio/high_mean": 0.015229720452334733, "clip_ratio/low_mean": 0.005334880515874829, "clip_ratio/low_min": 0.005334880515874829, "clip_ratio/region_mean": 0.020564600981306285, "completions/clipped_ratio": 0.9411764705882353, "completions/max_length": 128.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 124.7876838235294, "completions/mean_terminated_length": 74.75882474113914, "completions/min_length": 50.35294117647059, "completions/min_terminated_length": 50.35294117647059, "epoch": 0.9794101279910963, "frac_reward_zero_std": 0.0, "grad_norm": 7.419699192047119, "kl": 1.4550988680124284, "learning_rate": 1.5693148190302755e-06, "loss": 0.4187, "num_tokens": 14328034.0, "reward": 3.752244500552907, "reward_std": 5.818949124392341, "rewards/RewardModelWrapper/mean": 3.752244500552907, "rewards/RewardModelWrapper/std": 6.797629524679745, "step": 4400 }, { "clip_ratio/high_max": 0.018021058345912024, "clip_ratio/high_mean": 0.018021058345912024, "clip_ratio/low_mean": 0.0030438171711284667, "clip_ratio/low_min": 0.0030438171711284667, "clip_ratio/region_mean": 0.021064875536831097, "completions/clipped_ratio": 0.9549632352941176, "completions/max_length": 128.0, "completions/max_terminated_length": 96.23529411764706, "completions/mean_length": 126.15900735294117, "completions/mean_terminated_length": 79.46218647676356, "completions/min_length": 68.05882352941177, "completions/min_terminated_length": 60.529411764705884, "epoch": 0.9905397885364496, "frac_reward_zero_std": 0.0, "grad_norm": 5.946506977081299, "kl": 1.4544246417284012, "learning_rate": 1.5522422035055773e-06, "loss": 0.4204, "num_tokens": 14494631.0, "reward": 3.687691057429594, "reward_std": 5.869795238270479, "rewards/RewardModelWrapper/mean": 3.687691057429594, "rewards/RewardModelWrapper/std": 6.839460316826315, "step": 4450 }, { "epoch": 0.9996661101836394, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.9438291139240507, "eval_completions/max_length": 128.0, "eval_completions/max_terminated_length": 53.129746835443036, "eval_completions/mean_length": 125.40446993670886, "eval_completions/mean_terminated_length": 48.271835556513146, "eval_completions/min_length": 96.05696202531645, "eval_completions/min_terminated_length": 43.39873417721519, "eval_frac_reward_zero_std": 0.0, "eval_kl": 1.4363023352019395, "eval_loss": 0.41118884086608887, "eval_num_tokens": 14622004.0, "eval_reward": 3.463206129738047, "eval_reward_std": 6.040495253722124, "eval_rewards/RewardModelWrapper/mean": 3.463206129738047, "eval_rewards/RewardModelWrapper/std": 6.557550964476187, "eval_runtime": 1430.6223, "eval_samples_per_second": 0.441, "eval_steps_per_second": 0.028, "step": 4491 } ], "logging_steps": 50, "max_steps": 8986, "num_input_tokens_seen": 14622004, "num_train_epochs": 2, "save_steps": 2696, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }