{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.032520325203252, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14397.0, "completions/max_terminated_length": 14397.0, "completions/mean_length": 10168.53125, "completions/mean_terminated_length": 10168.53125, "completions/min_length": 5260.0, "completions/min_terminated_length": 5260.0, "entropy": 0.2622494325041771, "epoch": 0.008130081300813009, "frac_reward_zero_std": 0.0, "grad_norm": 0.3732398450374603, "learning_rate": 1e-06, "loss": -0.0544, "num_tokens": 346273.0, "reward": 0.670829176902771, "reward_std": 0.5900986194610596, "rewards/reward_func/mean": 0.670829176902771, "rewards/reward_func/std": 0.5900986194610596, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9909619092941284, "sampling/importance_sampling_ratio/min": 0.061016567051410675, "sampling/sampling_logp_difference/max": 2.796609878540039, "sampling/sampling_logp_difference/mean": 0.018657810986042023, "step": 1, "step_time": 415.91905756667256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13889.0, "completions/max_terminated_length": 13889.0, "completions/mean_length": 8686.625, "completions/mean_terminated_length": 8686.625, "completions/min_length": 3608.0, "completions/min_terminated_length": 3608.0, "entropy": 0.30042469687759876, "epoch": 0.016260162601626018, "frac_reward_zero_std": 0.0, "grad_norm": 0.42703714966773987, "learning_rate": 9.959999999999999e-07, "loss": -0.017, "num_tokens": 637157.0, "reward": 0.3524366021156311, "reward_std": 0.5026865601539612, "rewards/reward_func/mean": 0.3524366021156311, "rewards/reward_func/std": 0.5026865601539612, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.337921380996704, "sampling/importance_sampling_ratio/mean": 0.9895248413085938, "sampling/importance_sampling_ratio/min": 0.2977469265460968, "sampling/sampling_logp_difference/max": 1.2115113735198975, "sampling/sampling_logp_difference/mean": 0.021018292754888535, "step": 2, "step_time": 367.433393279789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16789.0, "completions/max_terminated_length": 16789.0, "completions/mean_length": 8983.875, "completions/mean_terminated_length": 8983.875, "completions/min_length": 1247.0, "completions/min_terminated_length": 1247.0, "entropy": 0.28624483197927475, "epoch": 0.024390243902439025, "frac_reward_zero_std": 0.0, "grad_norm": 0.42274078726768494, "learning_rate": 9.92e-07, "loss": -0.01, "num_tokens": 942513.0, "reward": 1.4242839813232422, "reward_std": 2.6354684829711914, "rewards/reward_func/mean": 1.4273738861083984, "rewards/reward_func/std": 2.6337430477142334, "rewards/soft_overlong_punishment_reward/mean": -0.00308990478515625, "rewards/soft_overlong_punishment_reward/std": 0.017479142174124718, "sampling/importance_sampling_ratio/max": 1.82925283908844, "sampling/importance_sampling_ratio/mean": 0.9905841946601868, "sampling/importance_sampling_ratio/min": 0.010209716856479645, "sampling/sampling_logp_difference/max": 4.584415435791016, "sampling/sampling_logp_difference/mean": 0.019620321691036224, "step": 3, "step_time": 357.44491101126187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15431.0, "completions/max_terminated_length": 15431.0, "completions/mean_length": 10926.4375, "completions/mean_terminated_length": 10926.4375, "completions/min_length": 6775.0, "completions/min_terminated_length": 6775.0, "entropy": 0.26211014203727245, "epoch": 0.032520325203252036, "frac_reward_zero_std": 0.0, "grad_norm": 0.43384817242622375, "learning_rate": 9.88e-07, "loss": -0.0053, "num_tokens": 1312175.0, "reward": 0.4805724620819092, "reward_std": 0.5330014824867249, "rewards/reward_func/mean": 0.4805724620819092, "rewards/reward_func/std": 0.5330014824867249, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.227057933807373, "sampling/importance_sampling_ratio/mean": 0.990934431552887, "sampling/importance_sampling_ratio/min": 0.023468947038054466, "sampling/sampling_logp_difference/max": 3.752077102661133, "sampling/sampling_logp_difference/mean": 0.018750693649053574, "step": 4, "step_time": 371.1139560753945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16857.0, "completions/max_terminated_length": 16857.0, "completions/mean_length": 8058.5625, "completions/mean_terminated_length": 8058.5625, "completions/min_length": 3038.0, "completions/min_terminated_length": 3038.0, "entropy": 0.2765342816710472, "epoch": 0.04065040650406504, "frac_reward_zero_std": 0.0, "grad_norm": 0.43143269419670105, "learning_rate": 9.84e-07, "loss": 0.0181, "num_tokens": 1585833.0, "reward": 1.0776739120483398, "reward_std": 0.8259380459785461, "rewards/reward_func/mean": 1.081282615661621, "rewards/reward_func/std": 0.8210198283195496, "rewards/soft_overlong_punishment_reward/mean": -0.00360870361328125, "rewards/soft_overlong_punishment_reward/std": 0.02041390910744667, "sampling/importance_sampling_ratio/max": 2.1974010467529297, "sampling/importance_sampling_ratio/mean": 0.9910835027694702, "sampling/importance_sampling_ratio/min": 0.31773900985717773, "sampling/sampling_logp_difference/max": 1.1465249061584473, "sampling/sampling_logp_difference/mean": 0.018462253734469414, "step": 5, "step_time": 336.172940433491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15287.0, "completions/max_terminated_length": 15287.0, "completions/mean_length": 9501.75, "completions/mean_terminated_length": 9501.75, "completions/min_length": 4768.0, "completions/min_terminated_length": 4768.0, "entropy": 0.2606150833889842, "epoch": 0.04878048780487805, "frac_reward_zero_std": 0.0, "grad_norm": 0.367913156747818, "learning_rate": 9.8e-07, "loss": 0.0207, "num_tokens": 1917425.0, "reward": 0.7828472852706909, "reward_std": 0.7593443989753723, "rewards/reward_func/mean": 0.7828472852706909, "rewards/reward_func/std": 0.7593443989753723, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.830876588821411, "sampling/importance_sampling_ratio/mean": 0.9910170435905457, "sampling/importance_sampling_ratio/min": 0.05133574455976486, "sampling/sampling_logp_difference/max": 2.9693679809570312, "sampling/sampling_logp_difference/mean": 0.01832835003733635, "step": 6, "step_time": 453.13167459354736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16987.0, "completions/max_terminated_length": 16987.0, "completions/mean_length": 11730.90625, "completions/mean_terminated_length": 11730.90625, "completions/min_length": 6346.0, "completions/min_terminated_length": 6346.0, "entropy": 0.2580571649596095, "epoch": 0.056910569105691054, "frac_reward_zero_std": 0.0, "grad_norm": 0.3932175934314728, "learning_rate": 9.759999999999998e-07, "loss": 0.0595, "num_tokens": 2308110.0, "reward": 0.4529891014099121, "reward_std": 0.5422401428222656, "rewards/reward_func/mean": 0.4602828025817871, "rewards/reward_func/std": 0.5356888175010681, "rewards/soft_overlong_punishment_reward/mean": -0.007293701171875, "rewards/soft_overlong_punishment_reward/std": 0.029728731140494347, "sampling/importance_sampling_ratio/max": 2.425518274307251, "sampling/importance_sampling_ratio/mean": 0.9910261034965515, "sampling/importance_sampling_ratio/min": 0.00816231407225132, "sampling/sampling_logp_difference/max": 4.8082275390625, "sampling/sampling_logp_difference/mean": 0.018817249685525894, "step": 7, "step_time": 420.06239356310107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13961.0, "completions/max_terminated_length": 13961.0, "completions/mean_length": 10841.65625, "completions/mean_terminated_length": 10841.65625, "completions/min_length": 5514.0, "completions/min_terminated_length": 5514.0, "entropy": 0.2597746094688773, "epoch": 0.06504065040650407, "frac_reward_zero_std": 0.25, "grad_norm": 0.3083711266517639, "learning_rate": 9.72e-07, "loss": 0.062, "num_tokens": 2676595.0, "reward": 0.3495897352695465, "reward_std": 0.7178114652633667, "rewards/reward_func/mean": 0.3495897352695465, "rewards/reward_func/std": 0.7178115248680115, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9910988807678223, "sampling/importance_sampling_ratio/min": 0.1719096601009369, "sampling/sampling_logp_difference/max": 1.7607861757278442, "sampling/sampling_logp_difference/mean": 0.01827491819858551, "step": 8, "step_time": 413.52012689388357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16061.0, "completions/max_terminated_length": 16061.0, "completions/mean_length": 11138.6875, "completions/mean_terminated_length": 11138.6875, "completions/min_length": 5675.0, "completions/min_terminated_length": 5675.0, "entropy": 0.2899176850914955, "epoch": 0.07317073170731707, "frac_reward_zero_std": 0.0, "grad_norm": 0.36722126603126526, "learning_rate": 9.679999999999999e-07, "loss": 0.0229, "num_tokens": 3048729.0, "reward": 0.39484167098999023, "reward_std": 0.5577594637870789, "rewards/reward_func/mean": 0.39484167098999023, "rewards/reward_func/std": 0.5577594637870789, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7904616594314575, "sampling/importance_sampling_ratio/mean": 0.989971399307251, "sampling/importance_sampling_ratio/min": 0.26559245586395264, "sampling/sampling_logp_difference/max": 1.3257923126220703, "sampling/sampling_logp_difference/mean": 0.020297091454267502, "step": 9, "step_time": 361.87901354860514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15494.0, "completions/max_terminated_length": 15494.0, "completions/mean_length": 11128.1875, "completions/mean_terminated_length": 11128.1875, "completions/min_length": 8007.0, "completions/min_terminated_length": 8007.0, "entropy": 0.289524232968688, "epoch": 0.08130081300813008, "frac_reward_zero_std": 0.0, "grad_norm": 0.3917076289653778, "learning_rate": 9.64e-07, "loss": -0.0053, "num_tokens": 3421799.0, "reward": 1.557268738746643, "reward_std": 4.395501613616943, "rewards/reward_func/mean": 1.557268738746643, "rewards/reward_func/std": 4.395502090454102, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9057838916778564, "sampling/importance_sampling_ratio/mean": 0.9899657964706421, "sampling/importance_sampling_ratio/min": 0.00011426152923377231, "sampling/sampling_logp_difference/max": 9.077020645141602, "sampling/sampling_logp_difference/mean": 0.02046096697449684, "step": 10, "step_time": 530.6975831072778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16418.0, "completions/max_terminated_length": 16418.0, "completions/mean_length": 11667.96875, "completions/mean_terminated_length": 11667.96875, "completions/min_length": 8577.0, "completions/min_terminated_length": 8577.0, "entropy": 0.25299404561519623, "epoch": 0.08943089430894309, "frac_reward_zero_std": 0.0, "grad_norm": 7.989307880401611, "learning_rate": 9.6e-07, "loss": 0.0268, "num_tokens": 3815326.0, "reward": 0.4127262830734253, "reward_std": 0.5275108814239502, "rewards/reward_func/mean": 0.4129856824874878, "rewards/reward_func/std": 0.5273244976997375, "rewards/soft_overlong_punishment_reward/mean": -0.0002593994140625, "rewards/soft_overlong_punishment_reward/std": 0.0014673846308141947, "sampling/importance_sampling_ratio/max": 2.6564552783966064, "sampling/importance_sampling_ratio/mean": 0.9911899566650391, "sampling/importance_sampling_ratio/min": 0.042087722569704056, "sampling/sampling_logp_difference/max": 3.167999267578125, "sampling/sampling_logp_difference/mean": 0.01853198930621147, "step": 11, "step_time": 455.64031926658936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15432.0, "completions/max_terminated_length": 15432.0, "completions/mean_length": 10867.875, "completions/mean_terminated_length": 10867.875, "completions/min_length": 7317.0, "completions/min_terminated_length": 7317.0, "entropy": 0.26554491464048624, "epoch": 0.0975609756097561, "frac_reward_zero_std": 0.0, "grad_norm": 0.35505810379981995, "learning_rate": 9.559999999999998e-07, "loss": 0.0226, "num_tokens": 4184850.0, "reward": 0.47400495409965515, "reward_std": 0.48420941829681396, "rewards/reward_func/mean": 0.47400495409965515, "rewards/reward_func/std": 0.4842093884944916, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1569271087646484, "sampling/importance_sampling_ratio/mean": 0.9906758069992065, "sampling/importance_sampling_ratio/min": 0.01572277396917343, "sampling/sampling_logp_difference/max": 4.152645111083984, "sampling/sampling_logp_difference/mean": 0.019163597375154495, "step": 12, "step_time": 359.7980523931328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19330.0, "completions/max_terminated_length": 19330.0, "completions/mean_length": 11394.46875, "completions/mean_terminated_length": 11394.46875, "completions/min_length": 6975.0, "completions/min_terminated_length": 6975.0, "entropy": 0.26600847486406565, "epoch": 0.10569105691056911, "frac_reward_zero_std": 0.0, "grad_norm": 0.3638916611671448, "learning_rate": 9.52e-07, "loss": 0.0105, "num_tokens": 4567681.0, "reward": 0.6445801258087158, "reward_std": 0.9351080656051636, "rewards/reward_func/mean": 0.6834137439727783, "rewards/reward_func/std": 0.8933014273643494, "rewards/soft_overlong_punishment_reward/mean": -0.0388336181640625, "rewards/soft_overlong_punishment_reward/std": 0.15481862425804138, "sampling/importance_sampling_ratio/max": 2.923246383666992, "sampling/importance_sampling_ratio/mean": 0.9907053112983704, "sampling/importance_sampling_ratio/min": 0.09609930962324142, "sampling/sampling_logp_difference/max": 2.3423731327056885, "sampling/sampling_logp_difference/mean": 0.0187990739941597, "step": 13, "step_time": 594.1962707811035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12979.0, "completions/max_terminated_length": 12979.0, "completions/mean_length": 7971.6875, "completions/mean_terminated_length": 7971.6875, "completions/min_length": 3552.0, "completions/min_terminated_length": 3552.0, "entropy": 0.3081512898206711, "epoch": 0.11382113821138211, "frac_reward_zero_std": 0.0, "grad_norm": 0.46477997303009033, "learning_rate": 9.479999999999999e-07, "loss": 0.0457, "num_tokens": 4830719.0, "reward": 0.9467858076095581, "reward_std": 0.8943569660186768, "rewards/reward_func/mean": 0.9467858076095581, "rewards/reward_func/std": 0.8943569660186768, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9614217281341553, "sampling/importance_sampling_ratio/mean": 0.989205002784729, "sampling/importance_sampling_ratio/min": 0.0001446620444767177, "sampling/sampling_logp_difference/max": 8.841110229492188, "sampling/sampling_logp_difference/mean": 0.021643634885549545, "step": 14, "step_time": 289.7629952353891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16237.0, "completions/max_terminated_length": 16237.0, "completions/mean_length": 10616.90625, "completions/mean_terminated_length": 10616.90625, "completions/min_length": 7285.0, "completions/min_terminated_length": 7285.0, "entropy": 0.293476989492774, "epoch": 0.12195121951219512, "frac_reward_zero_std": 0.25, "grad_norm": 0.3079313635826111, "learning_rate": 9.439999999999999e-07, "loss": -0.0016, "num_tokens": 5184964.0, "reward": 0.4604455828666687, "reward_std": 0.6293958425521851, "rewards/reward_func/mean": 0.4604455828666687, "rewards/reward_func/std": 0.6293958425521851, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9898009300231934, "sampling/importance_sampling_ratio/min": 0.18817251920700073, "sampling/sampling_logp_difference/max": 1.670396089553833, "sampling/sampling_logp_difference/mean": 0.020975295454263687, "step": 15, "step_time": 446.5528327494394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13793.0, "completions/max_terminated_length": 13793.0, "completions/mean_length": 10209.96875, "completions/mean_terminated_length": 10209.96875, "completions/min_length": 7867.0, "completions/min_terminated_length": 7867.0, "entropy": 0.26629081927239895, "epoch": 0.13008130081300814, "frac_reward_zero_std": 0.25, "grad_norm": 0.375059574842453, "learning_rate": 9.399999999999999e-07, "loss": -0.0079, "num_tokens": 5530355.0, "reward": 0.5800999999046326, "reward_std": 0.6380655169487, "rewards/reward_func/mean": 0.5800999999046326, "rewards/reward_func/std": 0.6380655765533447, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9908578991889954, "sampling/importance_sampling_ratio/min": 0.24528150260448456, "sampling/sampling_logp_difference/max": 1.405348777770996, "sampling/sampling_logp_difference/mean": 0.019185448065400124, "step": 16, "step_time": 322.7766472310759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14345.0, "completions/max_terminated_length": 14345.0, "completions/mean_length": 10214.875, "completions/mean_terminated_length": 10214.875, "completions/min_length": 8254.0, "completions/min_terminated_length": 8254.0, "entropy": 0.28936258889734745, "epoch": 0.13821138211382114, "frac_reward_zero_std": 0.0, "grad_norm": 0.39189663529396057, "learning_rate": 9.36e-07, "loss": -0.01, "num_tokens": 5868399.0, "reward": 0.7184683680534363, "reward_std": 1.3543201684951782, "rewards/reward_func/mean": 0.7184683680534363, "rewards/reward_func/std": 1.3543201684951782, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9899466037750244, "sampling/importance_sampling_ratio/min": 0.14265097677707672, "sampling/sampling_logp_difference/max": 1.9473543167114258, "sampling/sampling_logp_difference/mean": 0.02051045559346676, "step": 17, "step_time": 332.0521780475974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15805.0, "completions/max_terminated_length": 15805.0, "completions/mean_length": 12219.53125, "completions/mean_terminated_length": 12219.53125, "completions/min_length": 6227.0, "completions/min_terminated_length": 6227.0, "entropy": 0.24262672010809183, "epoch": 0.14634146341463414, "frac_reward_zero_std": 0.0, "grad_norm": 0.36067014932632446, "learning_rate": 9.32e-07, "loss": -0.0058, "num_tokens": 6284928.0, "reward": 0.4509197473526001, "reward_std": 0.5910488963127136, "rewards/reward_func/mean": 0.4509197473526001, "rewards/reward_func/std": 0.5910489559173584, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.732221841812134, "sampling/importance_sampling_ratio/mean": 0.9915755987167358, "sampling/importance_sampling_ratio/min": 0.08208515495061874, "sampling/sampling_logp_difference/max": 2.499998092651367, "sampling/sampling_logp_difference/mean": 0.017901983112096786, "step": 18, "step_time": 481.0771376036573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14306.0, "completions/max_terminated_length": 14306.0, "completions/mean_length": 11337.0625, "completions/mean_terminated_length": 11337.0625, "completions/min_length": 7568.0, "completions/min_terminated_length": 7568.0, "entropy": 0.2646152526140213, "epoch": 0.15447154471544716, "frac_reward_zero_std": 0.0, "grad_norm": 0.36779990792274475, "learning_rate": 9.28e-07, "loss": -0.0053, "num_tokens": 6675994.0, "reward": 0.4712689220905304, "reward_std": 0.6008197665214539, "rewards/reward_func/mean": 0.4712689220905304, "rewards/reward_func/std": 0.6008197069168091, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.159226179122925, "sampling/importance_sampling_ratio/mean": 0.990673303604126, "sampling/importance_sampling_ratio/min": 0.13695114850997925, "sampling/sampling_logp_difference/max": 1.988131046295166, "sampling/sampling_logp_difference/mean": 0.019125521183013916, "step": 19, "step_time": 353.8724719102029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14549.0, "completions/max_terminated_length": 14549.0, "completions/mean_length": 10831.9375, "completions/mean_terminated_length": 10831.9375, "completions/min_length": 7563.0, "completions/min_terminated_length": 7563.0, "entropy": 0.2252928400412202, "epoch": 0.16260162601626016, "frac_reward_zero_std": 0.0, "grad_norm": 0.3673231899738312, "learning_rate": 9.24e-07, "loss": 0.0035, "num_tokens": 7065000.0, "reward": 0.3244926333427429, "reward_std": 0.3530224561691284, "rewards/reward_func/mean": 0.3244926333427429, "rewards/reward_func/std": 0.3530224561691284, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0267248153686523, "sampling/importance_sampling_ratio/mean": 0.9923210740089417, "sampling/importance_sampling_ratio/min": 0.01011732593178749, "sampling/sampling_logp_difference/max": 4.593505859375, "sampling/sampling_logp_difference/mean": 0.016302792355418205, "step": 20, "step_time": 574.1707554678433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16370.0, "completions/max_terminated_length": 16370.0, "completions/mean_length": 10895.40625, "completions/mean_terminated_length": 10895.40625, "completions/min_length": 6243.0, "completions/min_terminated_length": 6243.0, "entropy": 0.28782651759684086, "epoch": 0.17073170731707318, "frac_reward_zero_std": 0.0, "grad_norm": 0.3720740079879761, "learning_rate": 9.2e-07, "loss": 0.0136, "num_tokens": 7431013.0, "reward": 0.9185539484024048, "reward_std": 0.8305948376655579, "rewards/reward_func/mean": 0.9185539484024048, "rewards/reward_func/std": 0.8305947780609131, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.346141815185547, "sampling/importance_sampling_ratio/mean": 0.9899004697799683, "sampling/importance_sampling_ratio/min": 0.1588689088821411, "sampling/sampling_logp_difference/max": 1.8396759033203125, "sampling/sampling_logp_difference/mean": 0.020509924739599228, "step": 21, "step_time": 422.31716467044316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13981.0, "completions/max_terminated_length": 13981.0, "completions/mean_length": 10807.5625, "completions/mean_terminated_length": 10807.5625, "completions/min_length": 8205.0, "completions/min_terminated_length": 8205.0, "entropy": 0.24795905128121376, "epoch": 0.17886178861788618, "frac_reward_zero_std": 0.0, "grad_norm": 0.3784899115562439, "learning_rate": 9.16e-07, "loss": 0.002, "num_tokens": 7802807.0, "reward": 0.5439961552619934, "reward_std": 0.6093953251838684, "rewards/reward_func/mean": 0.5439961552619934, "rewards/reward_func/std": 0.6093953251838684, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9913806915283203, "sampling/importance_sampling_ratio/min": 0.07173754274845123, "sampling/sampling_logp_difference/max": 2.6347410678863525, "sampling/sampling_logp_difference/mean": 0.01798321306705475, "step": 22, "step_time": 554.0859813888092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15253.0, "completions/max_terminated_length": 15253.0, "completions/mean_length": 10385.09375, "completions/mean_terminated_length": 10385.09375, "completions/min_length": 6117.0, "completions/min_terminated_length": 6117.0, "entropy": 0.2895797435194254, "epoch": 0.18699186991869918, "frac_reward_zero_std": 0.0, "grad_norm": 0.3935360014438629, "learning_rate": 9.12e-07, "loss": 0.0286, "num_tokens": 8154338.0, "reward": 0.690646767616272, "reward_std": 0.689028263092041, "rewards/reward_func/mean": 0.690646767616272, "rewards/reward_func/std": 0.6890282034873962, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2023396492004395, "sampling/importance_sampling_ratio/mean": 0.9899480938911438, "sampling/importance_sampling_ratio/min": 0.07151451706886292, "sampling/sampling_logp_difference/max": 2.637854814529419, "sampling/sampling_logp_difference/mean": 0.020658794790506363, "step": 23, "step_time": 574.1298945220187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15523.0, "completions/max_terminated_length": 15523.0, "completions/mean_length": 9569.84375, "completions/mean_terminated_length": 9569.84375, "completions/min_length": 3304.0, "completions/min_terminated_length": 3304.0, "entropy": 0.30353074334561825, "epoch": 0.1951219512195122, "frac_reward_zero_std": 0.0, "grad_norm": 0.41558295488357544, "learning_rate": 9.08e-07, "loss": -0.0139, "num_tokens": 8474285.0, "reward": 0.6444994807243347, "reward_std": 0.611183762550354, "rewards/reward_func/mean": 0.6444994807243347, "rewards/reward_func/std": 0.611183762550354, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9894601702690125, "sampling/importance_sampling_ratio/min": 0.27406805753707886, "sampling/sampling_logp_difference/max": 1.6113767623901367, "sampling/sampling_logp_difference/mean": 0.021533746272325516, "step": 24, "step_time": 392.5540506092366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17367.0, "completions/max_terminated_length": 17367.0, "completions/mean_length": 10313.59375, "completions/mean_terminated_length": 10313.59375, "completions/min_length": 5453.0, "completions/min_terminated_length": 5453.0, "entropy": 0.28294676542282104, "epoch": 0.2032520325203252, "frac_reward_zero_std": 0.0, "grad_norm": 0.37816983461380005, "learning_rate": 9.039999999999999e-07, "loss": -0.0331, "num_tokens": 8821040.0, "reward": 0.7799074649810791, "reward_std": 0.7231826782226562, "rewards/reward_func/mean": 0.7874071598052979, "rewards/reward_func/std": 0.7279127836227417, "rewards/soft_overlong_punishment_reward/mean": -0.00749969482421875, "rewards/soft_overlong_punishment_reward/std": 0.042424678802490234, "sampling/importance_sampling_ratio/max": 2.6919732093811035, "sampling/importance_sampling_ratio/mean": 0.99010169506073, "sampling/importance_sampling_ratio/min": 0.24937866628170013, "sampling/sampling_logp_difference/max": 1.3887828588485718, "sampling/sampling_logp_difference/mean": 0.020352153107523918, "step": 25, "step_time": 420.245932768099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 20480.0, "completions/max_terminated_length": 13815.0, "completions/mean_length": 10411.9375, "completions/mean_terminated_length": 10087.1611328125, "completions/min_length": 7108.0, "completions/min_terminated_length": 7108.0, "entropy": 0.25935694947838783, "epoch": 0.21138211382113822, "frac_reward_zero_std": 0.0, "grad_norm": 0.34230929613113403, "learning_rate": 9e-07, "loss": -0.0888, "num_tokens": 9173350.0, "reward": 0.48251837491989136, "reward_std": 0.5877281427383423, "rewards/reward_func/mean": 0.5303415656089783, "rewards/reward_func/std": 0.5303896069526672, "rewards/soft_overlong_punishment_reward/mean": -0.03125, "rewards/soft_overlong_punishment_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.9357402324676514, "sampling/importance_sampling_ratio/mean": 0.9906986951828003, "sampling/importance_sampling_ratio/min": 0.20663174986839294, "sampling/sampling_logp_difference/max": 1.5768170356750488, "sampling/sampling_logp_difference/mean": 0.01940227299928665, "step": 26, "step_time": 618.8288546381518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 20480.0, "completions/max_terminated_length": 19929.0, "completions/mean_length": 12748.8125, "completions/mean_terminated_length": 12499.4189453125, "completions/min_length": 8056.0, "completions/min_terminated_length": 8056.0, "entropy": 0.2515546875074506, "epoch": 0.21951219512195122, "frac_reward_zero_std": 0.0, "grad_norm": 0.3102463185787201, "learning_rate": 8.96e-07, "loss": 0.0175, "num_tokens": 9609168.0, "reward": 0.390100359916687, "reward_std": 0.6925687789916992, "rewards/reward_func/mean": 0.4781709611415863, "rewards/reward_func/std": 0.6022319793701172, "rewards/soft_overlong_punishment_reward/mean": -0.07312774658203125, "rewards/soft_overlong_punishment_reward/std": 0.23431003093719482, "sampling/importance_sampling_ratio/max": 2.518101215362549, "sampling/importance_sampling_ratio/mean": 0.9911104440689087, "sampling/importance_sampling_ratio/min": 0.22955988347530365, "sampling/sampling_logp_difference/max": 1.4715913534164429, "sampling/sampling_logp_difference/mean": 0.018376227468252182, "step": 27, "step_time": 459.3379825237207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13431.0, "completions/max_terminated_length": 13431.0, "completions/mean_length": 8919.34375, "completions/mean_terminated_length": 8919.34375, "completions/min_length": 5918.0, "completions/min_terminated_length": 5918.0, "entropy": 0.30696201138198376, "epoch": 0.22764227642276422, "frac_reward_zero_std": 0.0, "grad_norm": 0.4097852408885956, "learning_rate": 8.92e-07, "loss": 0.0214, "num_tokens": 9904035.0, "reward": 0.961408257484436, "reward_std": 0.7320297360420227, "rewards/reward_func/mean": 0.961408257484436, "rewards/reward_func/std": 0.7320297360420227, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4255101680755615, "sampling/importance_sampling_ratio/mean": 0.9893720149993896, "sampling/importance_sampling_ratio/min": 0.06206132099032402, "sampling/sampling_logp_difference/max": 2.779632329940796, "sampling/sampling_logp_difference/mean": 0.02153138443827629, "step": 28, "step_time": 289.10520208696835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16485.0, "completions/max_terminated_length": 16485.0, "completions/mean_length": 11523.40625, "completions/mean_terminated_length": 11523.40625, "completions/min_length": 7697.0, "completions/min_terminated_length": 7697.0, "entropy": 0.278985645622015, "epoch": 0.23577235772357724, "frac_reward_zero_std": 0.0, "grad_norm": 0.3535861670970917, "learning_rate": 8.88e-07, "loss": 0.0286, "num_tokens": 10295952.0, "reward": 0.4712695777416229, "reward_std": 0.5820122361183167, "rewards/reward_func/mean": 0.4720401465892792, "rewards/reward_func/std": 0.5814188122749329, "rewards/soft_overlong_punishment_reward/mean": -0.00077056884765625, "rewards/soft_overlong_punishment_reward/std": 0.004358995705842972, "sampling/importance_sampling_ratio/max": 2.9528937339782715, "sampling/importance_sampling_ratio/mean": 0.9902883768081665, "sampling/importance_sampling_ratio/min": 0.06855468451976776, "sampling/sampling_logp_difference/max": 2.6801235675811768, "sampling/sampling_logp_difference/mean": 0.0203237347304821, "step": 29, "step_time": 426.5832918223459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15061.0, "completions/max_terminated_length": 15061.0, "completions/mean_length": 9198.8125, "completions/mean_terminated_length": 9198.8125, "completions/min_length": 3636.0, "completions/min_terminated_length": 3636.0, "entropy": 0.3061511926352978, "epoch": 0.24390243902439024, "frac_reward_zero_std": 0.0, "grad_norm": 0.4075096547603607, "learning_rate": 8.839999999999999e-07, "loss": 0.0272, "num_tokens": 10603946.0, "reward": 0.4550362825393677, "reward_std": 0.4571691155433655, "rewards/reward_func/mean": 0.4550362825393677, "rewards/reward_func/std": 0.4571691155433655, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.989551305770874, "sampling/importance_sampling_ratio/min": 0.4050315022468567, "sampling/sampling_logp_difference/max": 1.1088628768920898, "sampling/sampling_logp_difference/mean": 0.021421968936920166, "step": 30, "step_time": 363.5614477007184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15959.0, "completions/max_terminated_length": 15959.0, "completions/mean_length": 11325.1875, "completions/mean_terminated_length": 11325.1875, "completions/min_length": 8080.0, "completions/min_terminated_length": 8080.0, "entropy": 0.28197295404970646, "epoch": 0.25203252032520324, "frac_reward_zero_std": 0.0, "grad_norm": 0.3684118986129761, "learning_rate": 8.799999999999999e-07, "loss": 0.0276, "num_tokens": 10989504.0, "reward": 0.864295244216919, "reward_std": 1.0850777626037598, "rewards/reward_func/mean": 0.864295244216919, "rewards/reward_func/std": 1.0850777626037598, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9901672601699829, "sampling/importance_sampling_ratio/min": 0.0023719461169093847, "sampling/sampling_logp_difference/max": 6.044044494628906, "sampling/sampling_logp_difference/mean": 0.020149797201156616, "step": 31, "step_time": 429.3172168934252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17952.0, "completions/max_terminated_length": 17952.0, "completions/mean_length": 10753.03125, "completions/mean_terminated_length": 10753.03125, "completions/min_length": 6510.0, "completions/min_terminated_length": 6510.0, "entropy": 0.2921921294182539, "epoch": 0.2601626016260163, "frac_reward_zero_std": 0.25, "grad_norm": 0.3402082920074463, "learning_rate": 8.76e-07, "loss": 0.0616, "num_tokens": 11344297.0, "reward": 1.0129244327545166, "reward_std": 0.6747909784317017, "rewards/reward_func/mean": 1.0248873233795166, "rewards/reward_func/std": 0.6532120108604431, "rewards/soft_overlong_punishment_reward/mean": -0.011962890625, "rewards/soft_overlong_punishment_reward/std": 0.06767232716083527, "sampling/importance_sampling_ratio/max": 2.011244058609009, "sampling/importance_sampling_ratio/mean": 0.990020751953125, "sampling/importance_sampling_ratio/min": 0.2620807886123657, "sampling/sampling_logp_difference/max": 1.3391025066375732, "sampling/sampling_logp_difference/mean": 0.020625203847885132, "step": 32, "step_time": 405.72090286947787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17144.0, "completions/max_terminated_length": 17144.0, "completions/mean_length": 10907.46875, "completions/mean_terminated_length": 10907.46875, "completions/min_length": 4381.0, "completions/min_terminated_length": 4381.0, "entropy": 0.2899629846215248, "epoch": 0.2682926829268293, "frac_reward_zero_std": 0.0, "grad_norm": 0.38532310724258423, "learning_rate": 8.72e-07, "loss": 0.0188, "num_tokens": 11715560.0, "reward": 0.5839906930923462, "reward_std": 0.715369462966919, "rewards/reward_func/mean": 0.5969988107681274, "rewards/reward_func/std": 0.7036058902740479, "rewards/soft_overlong_punishment_reward/mean": -0.01300811767578125, "rewards/soft_overlong_punishment_reward/std": 0.0447469986975193, "sampling/importance_sampling_ratio/max": 2.857685089111328, "sampling/importance_sampling_ratio/mean": 0.9903793931007385, "sampling/importance_sampling_ratio/min": 0.16617049276828766, "sampling/sampling_logp_difference/max": 1.794740915298462, "sampling/sampling_logp_difference/mean": 0.01988939195871353, "step": 33, "step_time": 451.89993859338574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15459.0, "completions/max_terminated_length": 15459.0, "completions/mean_length": 10224.9375, "completions/mean_terminated_length": 10224.9375, "completions/min_length": 6120.0, "completions/min_terminated_length": 6120.0, "entropy": 0.29201703891158104, "epoch": 0.2764227642276423, "frac_reward_zero_std": 0.0, "grad_norm": 0.4802566170692444, "learning_rate": 8.68e-07, "loss": -0.0018, "num_tokens": 12058662.0, "reward": 0.6011689901351929, "reward_std": 0.6944944262504578, "rewards/reward_func/mean": 0.6011689901351929, "rewards/reward_func/std": 0.6944944262504578, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0942885875701904, "sampling/importance_sampling_ratio/mean": 0.9899111390113831, "sampling/importance_sampling_ratio/min": 0.048549018800258636, "sampling/sampling_logp_difference/max": 3.025181293487549, "sampling/sampling_logp_difference/mean": 0.02062905579805374, "step": 34, "step_time": 560.5957993268967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14074.0, "completions/max_terminated_length": 14074.0, "completions/mean_length": 9658.96875, "completions/mean_terminated_length": 9658.96875, "completions/min_length": 5878.0, "completions/min_terminated_length": 5878.0, "entropy": 0.28341494500637054, "epoch": 0.2845528455284553, "frac_reward_zero_std": 0.0, "grad_norm": 0.46002474427223206, "learning_rate": 8.639999999999999e-07, "loss": 0.0129, "num_tokens": 12387477.0, "reward": 1.293872594833374, "reward_std": 1.2786844968795776, "rewards/reward_func/mean": 1.293872594833374, "rewards/reward_func/std": 1.2786844968795776, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6462442874908447, "sampling/importance_sampling_ratio/mean": 0.9899374842643738, "sampling/importance_sampling_ratio/min": 0.004488667007535696, "sampling/sampling_logp_difference/max": 5.4061994552612305, "sampling/sampling_logp_difference/mean": 0.02022422105073929, "step": 35, "step_time": 324.04454420367256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16723.0, "completions/max_terminated_length": 16723.0, "completions/mean_length": 10310.90625, "completions/mean_terminated_length": 10310.90625, "completions/min_length": 6380.0, "completions/min_terminated_length": 6380.0, "entropy": 0.26594763714820147, "epoch": 0.2926829268292683, "frac_reward_zero_std": 0.0, "grad_norm": 0.3799055516719818, "learning_rate": 8.599999999999999e-07, "loss": 0.0193, "num_tokens": 12743762.0, "reward": 1.0520066022872925, "reward_std": 1.5496177673339844, "rewards/reward_func/mean": 1.0545929670333862, "rewards/reward_func/std": 1.5477306842803955, "rewards/soft_overlong_punishment_reward/mean": -0.00258636474609375, "rewards/soft_overlong_punishment_reward/std": 0.014630688354372978, "sampling/importance_sampling_ratio/max": 2.5548853874206543, "sampling/importance_sampling_ratio/mean": 0.9906677007675171, "sampling/importance_sampling_ratio/min": 0.34483057260513306, "sampling/sampling_logp_difference/max": 1.064702033996582, "sampling/sampling_logp_difference/mean": 0.018773481249809265, "step": 36, "step_time": 596.4897175964434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14483.0, "completions/max_terminated_length": 14483.0, "completions/mean_length": 8411.0, "completions/mean_terminated_length": 8411.0, "completions/min_length": 2507.0, "completions/min_terminated_length": 2507.0, "entropy": 0.28924608789384365, "epoch": 0.3008130081300813, "frac_reward_zero_std": 0.0, "grad_norm": 0.44713178277015686, "learning_rate": 8.559999999999999e-07, "loss": -0.0091, "num_tokens": 13027682.0, "reward": 0.9169277548789978, "reward_std": 0.6975835561752319, "rewards/reward_func/mean": 0.9169277548789978, "rewards/reward_func/std": 0.6975834965705872, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9900239706039429, "sampling/importance_sampling_ratio/min": 0.2362688183784485, "sampling/sampling_logp_difference/max": 1.4427850246429443, "sampling/sampling_logp_difference/mean": 0.020912623032927513, "step": 37, "step_time": 514.7615225652698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14294.0, "completions/max_terminated_length": 14294.0, "completions/mean_length": 10098.125, "completions/mean_terminated_length": 10098.125, "completions/min_length": 5654.0, "completions/min_terminated_length": 5654.0, "entropy": 0.2681300761178136, "epoch": 0.3089430894308943, "frac_reward_zero_std": 0.0, "grad_norm": 0.3977094292640686, "learning_rate": 8.52e-07, "loss": 0.0076, "num_tokens": 13373614.0, "reward": 0.3022610545158386, "reward_std": 0.37053707242012024, "rewards/reward_func/mean": 0.3022610545158386, "rewards/reward_func/std": 0.37053707242012024, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9909188747406006, "sampling/importance_sampling_ratio/min": 0.13540859520435333, "sampling/sampling_logp_difference/max": 1.9994584321975708, "sampling/sampling_logp_difference/mean": 0.018634535372257233, "step": 38, "step_time": 455.13201168039814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15760.0, "completions/max_terminated_length": 15760.0, "completions/mean_length": 11131.75, "completions/mean_terminated_length": 11131.75, "completions/min_length": 8044.0, "completions/min_terminated_length": 8044.0, "entropy": 0.2536044828593731, "epoch": 0.3170731707317073, "frac_reward_zero_std": 0.0, "grad_norm": 0.40394505858421326, "learning_rate": 8.48e-07, "loss": 0.0142, "num_tokens": 13752318.0, "reward": 0.5811458230018616, "reward_std": 0.6193008422851562, "rewards/reward_func/mean": 0.5811458230018616, "rewards/reward_func/std": 0.6193007826805115, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9912146329879761, "sampling/importance_sampling_ratio/min": 0.02844265289604664, "sampling/sampling_logp_difference/max": 3.5598654747009277, "sampling/sampling_logp_difference/mean": 0.018120869994163513, "step": 39, "step_time": 425.23402834986337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14644.0, "completions/max_terminated_length": 14644.0, "completions/mean_length": 7563.40625, "completions/mean_terminated_length": 7563.40625, "completions/min_length": 2451.0, "completions/min_terminated_length": 2451.0, "entropy": 0.30642482824623585, "epoch": 0.3252032520325203, "frac_reward_zero_std": 0.25, "grad_norm": 0.4431585669517517, "learning_rate": 8.439999999999999e-07, "loss": 0.0, "num_tokens": 14004859.0, "reward": 0.8984022736549377, "reward_std": 0.7316007614135742, "rewards/reward_func/mean": 0.8984022736549377, "rewards/reward_func/std": 0.7316007018089294, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.272446870803833, "sampling/importance_sampling_ratio/mean": 0.9895340204238892, "sampling/importance_sampling_ratio/min": 0.10593052953481674, "sampling/sampling_logp_difference/max": 2.244971752166748, "sampling/sampling_logp_difference/mean": 0.021232325583696365, "step": 40, "step_time": 422.6681289859116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11990.0, "completions/max_terminated_length": 11990.0, "completions/mean_length": 8507.84375, "completions/mean_terminated_length": 8507.84375, "completions/min_length": 4808.0, "completions/min_terminated_length": 4808.0, "entropy": 0.29619857482612133, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.4351663589477539, "learning_rate": 8.399999999999999e-07, "loss": 0.029, "num_tokens": 14290710.0, "reward": 1.4823139905929565, "reward_std": 0.973460853099823, "rewards/reward_func/mean": 1.4823139905929565, "rewards/reward_func/std": 0.973460853099823, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9896551370620728, "sampling/importance_sampling_ratio/min": 0.32150232791900635, "sampling/sampling_logp_difference/max": 1.4671876430511475, "sampling/sampling_logp_difference/mean": 0.020717762410640717, "step": 41, "step_time": 281.4393497968558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16896.0, "completions/max_terminated_length": 16896.0, "completions/mean_length": 11579.0, "completions/mean_terminated_length": 11579.0, "completions/min_length": 7481.0, "completions/min_terminated_length": 7481.0, "entropy": 0.25882996898144484, "epoch": 0.34146341463414637, "frac_reward_zero_std": 0.0, "grad_norm": 0.3679807484149933, "learning_rate": 8.359999999999999e-07, "loss": 0.032, "num_tokens": 14682342.0, "reward": 2.730255365371704, "reward_std": 13.133237838745117, "rewards/reward_func/mean": 2.734161615371704, "rewards/reward_func/std": 13.132379531860352, "rewards/soft_overlong_punishment_reward/mean": -0.00390625, "rewards/soft_overlong_punishment_reward/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.990980863571167, "sampling/importance_sampling_ratio/min": 0.017838984727859497, "sampling/sampling_logp_difference/max": 4.026369094848633, "sampling/sampling_logp_difference/mean": 0.018732603639364243, "step": 42, "step_time": 419.91274624480866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15685.0, "completions/max_terminated_length": 15685.0, "completions/mean_length": 12179.1875, "completions/mean_terminated_length": 12179.1875, "completions/min_length": 8132.0, "completions/min_terminated_length": 8132.0, "entropy": 0.2707740031182766, "epoch": 0.34959349593495936, "frac_reward_zero_std": 0.0, "grad_norm": 0.3744344115257263, "learning_rate": 8.319999999999999e-07, "loss": -0.0033, "num_tokens": 15096724.0, "reward": 0.6232322454452515, "reward_std": 0.8935548663139343, "rewards/reward_func/mean": 0.6232322454452515, "rewards/reward_func/std": 0.8935548067092896, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9905098676681519, "sampling/importance_sampling_ratio/min": 0.30308058857917786, "sampling/sampling_logp_difference/max": 1.1937564611434937, "sampling/sampling_logp_difference/mean": 0.019495470449328423, "step": 43, "step_time": 603.9360419125296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14580.0, "completions/max_terminated_length": 14580.0, "completions/mean_length": 7975.375, "completions/mean_terminated_length": 7975.375, "completions/min_length": 3591.0, "completions/min_terminated_length": 3591.0, "entropy": 0.3197885435074568, "epoch": 0.35772357723577236, "frac_reward_zero_std": 0.25, "grad_norm": 0.3738175928592682, "learning_rate": 8.28e-07, "loss": -0.0016, "num_tokens": 15360248.0, "reward": 0.7166826725006104, "reward_std": 0.6693564057350159, "rewards/reward_func/mean": 0.7166826725006104, "rewards/reward_func/std": 0.6693564057350159, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.324842929840088, "sampling/importance_sampling_ratio/mean": 0.9886108040809631, "sampling/importance_sampling_ratio/min": 0.3598307967185974, "sampling/sampling_logp_difference/max": 1.0221214294433594, "sampling/sampling_logp_difference/mean": 0.022567734122276306, "step": 44, "step_time": 345.220199523028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 20480.0, "completions/max_terminated_length": 19443.0, "completions/mean_length": 10098.34375, "completions/mean_terminated_length": 9024.37890625, "completions/min_length": 2638.0, "completions/min_terminated_length": 2638.0, "entropy": 0.2705630399286747, "epoch": 0.36585365853658536, "frac_reward_zero_std": 0.0, "grad_norm": 0.3647894561290741, "learning_rate": 8.24e-07, "loss": -0.0494, "num_tokens": 15697299.0, "reward": 0.4475719630718231, "reward_std": 0.769990086555481, "rewards/reward_func/mean": 0.6297746896743774, "rewards/reward_func/std": 0.5889378786087036, "rewards/soft_overlong_punishment_reward/mean": -0.12316131591796875, "rewards/soft_overlong_punishment_reward/std": 0.3167433738708496, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9901247024536133, "sampling/importance_sampling_ratio/min": 0.1971900314092636, "sampling/sampling_logp_difference/max": 1.6235873699188232, "sampling/sampling_logp_difference/mean": 0.02022046223282814, "step": 45, "step_time": 460.2549244968686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15474.0, "completions/max_terminated_length": 15474.0, "completions/mean_length": 10772.21875, "completions/mean_terminated_length": 10772.21875, "completions/min_length": 5099.0, "completions/min_terminated_length": 5099.0, "entropy": 0.28201122768223286, "epoch": 0.37398373983739835, "frac_reward_zero_std": 0.0, "grad_norm": 0.36336085200309753, "learning_rate": 8.199999999999999e-07, "loss": -0.0064, "num_tokens": 16060146.0, "reward": 0.6875309944152832, "reward_std": 0.8879403471946716, "rewards/reward_func/mean": 0.6875309944152832, "rewards/reward_func/std": 0.8879403471946716, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.298703670501709, "sampling/importance_sampling_ratio/mean": 0.9902758598327637, "sampling/importance_sampling_ratio/min": 0.1755373477935791, "sampling/sampling_logp_difference/max": 1.739903450012207, "sampling/sampling_logp_difference/mean": 0.019840704277157784, "step": 46, "step_time": 391.7966483985074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12558.0, "completions/max_terminated_length": 12558.0, "completions/mean_length": 10090.78125, "completions/mean_terminated_length": 10090.78125, "completions/min_length": 5820.0, "completions/min_terminated_length": 5820.0, "entropy": 0.27039683051407337, "epoch": 0.3821138211382114, "frac_reward_zero_std": 0.25, "grad_norm": 0.31569579243659973, "learning_rate": 8.159999999999999e-07, "loss": -0.009, "num_tokens": 16401115.0, "reward": 0.7058383226394653, "reward_std": 0.6476169228553772, "rewards/reward_func/mean": 0.7058383226394653, "rewards/reward_func/std": 0.6476169228553772, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0942885875701904, "sampling/importance_sampling_ratio/mean": 0.9906511902809143, "sampling/importance_sampling_ratio/min": 0.20646513998508453, "sampling/sampling_logp_difference/max": 1.577623724937439, "sampling/sampling_logp_difference/mean": 0.019238049164414406, "step": 47, "step_time": 322.8998982391786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16934.0, "completions/max_terminated_length": 16934.0, "completions/mean_length": 10200.96875, "completions/mean_terminated_length": 10200.96875, "completions/min_length": 4126.0, "completions/min_terminated_length": 4126.0, "entropy": 0.2863293197005987, "epoch": 0.3902439024390244, "frac_reward_zero_std": 0.0, "grad_norm": 0.4074859917163849, "learning_rate": 8.12e-07, "loss": 0.0823, "num_tokens": 16748266.0, "reward": 2.437450647354126, "reward_std": 4.401010990142822, "rewards/reward_func/mean": 2.4416468143463135, "rewards/reward_func/std": 4.398542881011963, "rewards/soft_overlong_punishment_reward/mean": -0.0041961669921875, "rewards/soft_overlong_punishment_reward/std": 0.02373710460960865, "sampling/importance_sampling_ratio/max": 2.655956745147705, "sampling/importance_sampling_ratio/mean": 0.9903603196144104, "sampling/importance_sampling_ratio/min": 0.21017666161060333, "sampling/sampling_logp_difference/max": 1.5598068237304688, "sampling/sampling_logp_difference/mean": 0.02004138007760048, "step": 48, "step_time": 607.1050487488974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15864.0, "completions/max_terminated_length": 15864.0, "completions/mean_length": 10736.78125, "completions/mean_terminated_length": 10736.78125, "completions/min_length": 7612.0, "completions/min_terminated_length": 7612.0, "entropy": 0.26676939986646175, "epoch": 0.3983739837398374, "frac_reward_zero_std": 0.0, "grad_norm": 0.3676176965236664, "learning_rate": 8.08e-07, "loss": -0.0161, "num_tokens": 17112683.0, "reward": 1.2646254301071167, "reward_std": 0.9504536390304565, "rewards/reward_func/mean": 1.2646254301071167, "rewards/reward_func/std": 0.9504537582397461, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9370557069778442, "sampling/importance_sampling_ratio/mean": 0.9906842708587646, "sampling/importance_sampling_ratio/min": 0.3161299228668213, "sampling/sampling_logp_difference/max": 1.151602029800415, "sampling/sampling_logp_difference/mean": 0.019057895988225937, "step": 49, "step_time": 552.4623964948114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17648.0, "completions/max_terminated_length": 17648.0, "completions/mean_length": 12040.8125, "completions/mean_terminated_length": 12040.8125, "completions/min_length": 7400.0, "completions/min_terminated_length": 7400.0, "entropy": 0.26970171742141247, "epoch": 0.4065040650406504, "frac_reward_zero_std": 0.0, "grad_norm": 0.35298776626586914, "learning_rate": 8.04e-07, "loss": 0.0134, "num_tokens": 17522533.0, "reward": 0.883129358291626, "reward_std": 0.6038604974746704, "rewards/reward_func/mean": 0.892772912979126, "rewards/reward_func/std": 0.5872755646705627, "rewards/soft_overlong_punishment_reward/mean": -0.0096435546875, "rewards/soft_overlong_punishment_reward/std": 0.05455218255519867, "sampling/importance_sampling_ratio/max": 2.493983745574951, "sampling/importance_sampling_ratio/mean": 0.9904340505599976, "sampling/importance_sampling_ratio/min": 0.058902557939291, "sampling/sampling_logp_difference/max": 2.8318707942962646, "sampling/sampling_logp_difference/mean": 0.01974073052406311, "step": 50, "step_time": 399.5289772397373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16737.0, "completions/max_terminated_length": 16737.0, "completions/mean_length": 11610.8125, "completions/mean_terminated_length": 11610.8125, "completions/min_length": 6722.0, "completions/min_terminated_length": 6722.0, "entropy": 0.27386337146162987, "epoch": 0.4146341463414634, "frac_reward_zero_std": 0.0, "grad_norm": 0.3504253029823303, "learning_rate": 8e-07, "loss": 0.0388, "num_tokens": 17911655.0, "reward": 0.27469968795776367, "reward_std": 0.5160353183746338, "rewards/reward_func/mean": 0.2773928642272949, "rewards/reward_func/std": 0.5143131017684937, "rewards/soft_overlong_punishment_reward/mean": -0.00269317626953125, "rewards/soft_overlong_punishment_reward/std": 0.015234904363751411, "sampling/importance_sampling_ratio/max": 2.0953574180603027, "sampling/importance_sampling_ratio/mean": 0.9904600381851196, "sampling/importance_sampling_ratio/min": 0.2542196810245514, "sampling/sampling_logp_difference/max": 1.3695564270019531, "sampling/sampling_logp_difference/mean": 0.019640503451228142, "step": 51, "step_time": 399.3099301927723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15086.0, "completions/max_terminated_length": 15086.0, "completions/mean_length": 10634.1875, "completions/mean_terminated_length": 10634.1875, "completions/min_length": 6556.0, "completions/min_terminated_length": 6556.0, "entropy": 0.30065641924738884, "epoch": 0.42276422764227645, "frac_reward_zero_std": 0.0, "grad_norm": 0.45073068141937256, "learning_rate": 7.96e-07, "loss": -0.0302, "num_tokens": 18269029.0, "reward": 0.7840328216552734, "reward_std": 0.9923391342163086, "rewards/reward_func/mean": 0.7840328216552734, "rewards/reward_func/std": 0.9923390746116638, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3403565883636475, "sampling/importance_sampling_ratio/mean": 0.9895017147064209, "sampling/importance_sampling_ratio/min": 0.001712719677016139, "sampling/sampling_logp_difference/max": 6.369672775268555, "sampling/sampling_logp_difference/mean": 0.021533269435167313, "step": 52, "step_time": 380.3188033842016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14572.0, "completions/max_terminated_length": 14572.0, "completions/mean_length": 10499.125, "completions/mean_terminated_length": 10499.125, "completions/min_length": 6328.0, "completions/min_terminated_length": 6328.0, "entropy": 0.283049238845706, "epoch": 0.43089430894308944, "frac_reward_zero_std": 0.0, "grad_norm": 0.37461337447166443, "learning_rate": 7.92e-07, "loss": -0.0126, "num_tokens": 18619073.0, "reward": 0.9159905910491943, "reward_std": 0.7809432744979858, "rewards/reward_func/mean": 0.9159905910491943, "rewards/reward_func/std": 0.7809432148933411, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.277055263519287, "sampling/importance_sampling_ratio/mean": 0.9902773499488831, "sampling/importance_sampling_ratio/min": 0.279052734375, "sampling/sampling_logp_difference/max": 1.276354432106018, "sampling/sampling_logp_difference/mean": 0.01988813653588295, "step": 53, "step_time": 503.7549276971258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14762.0, "completions/max_terminated_length": 14762.0, "completions/mean_length": 10943.90625, "completions/mean_terminated_length": 10943.90625, "completions/min_length": 7617.0, "completions/min_terminated_length": 7617.0, "entropy": 0.2909552175551653, "epoch": 0.43902439024390244, "frac_reward_zero_std": 0.0, "grad_norm": 0.40936458110809326, "learning_rate": 7.88e-07, "loss": -0.0008, "num_tokens": 18986318.0, "reward": 0.9623677730560303, "reward_std": 0.6776258945465088, "rewards/reward_func/mean": 0.9623677730560303, "rewards/reward_func/std": 0.677625834941864, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.733780860900879, "sampling/importance_sampling_ratio/mean": 0.9898478388786316, "sampling/importance_sampling_ratio/min": 0.11436156183481216, "sampling/sampling_logp_difference/max": 2.1683902740478516, "sampling/sampling_logp_difference/mean": 0.020851192995905876, "step": 54, "step_time": 412.2992678086739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13410.0, "completions/max_terminated_length": 13410.0, "completions/mean_length": 8160.375, "completions/mean_terminated_length": 8160.375, "completions/min_length": 3340.0, "completions/min_terminated_length": 3340.0, "entropy": 0.29048606380820274, "epoch": 0.44715447154471544, "frac_reward_zero_std": 0.0, "grad_norm": 0.46355557441711426, "learning_rate": 7.84e-07, "loss": 0.0469, "num_tokens": 19264786.0, "reward": 1.9834721088409424, "reward_std": 1.6326876878738403, "rewards/reward_func/mean": 1.9834721088409424, "rewards/reward_func/std": 1.6326874494552612, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3705220222473145, "sampling/importance_sampling_ratio/mean": 0.9901043176651001, "sampling/importance_sampling_ratio/min": 0.23068562150001526, "sampling/sampling_logp_difference/max": 1.466699481010437, "sampling/sampling_logp_difference/mean": 0.01999868080019951, "step": 55, "step_time": 352.4861610988155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16042.0, "completions/max_terminated_length": 16042.0, "completions/mean_length": 11029.0, "completions/mean_terminated_length": 11029.0, "completions/min_length": 7361.0, "completions/min_terminated_length": 7361.0, "entropy": 0.2770060170441866, "epoch": 0.45528455284552843, "frac_reward_zero_std": 0.0, "grad_norm": 0.35951629281044006, "learning_rate": 7.799999999999999e-07, "loss": 0.01, "num_tokens": 19632250.0, "reward": 0.7494408488273621, "reward_std": 0.6710403561592102, "rewards/reward_func/mean": 0.7494408488273621, "rewards/reward_func/std": 0.6710402965545654, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9904565811157227, "sampling/importance_sampling_ratio/min": 0.19462724030017853, "sampling/sampling_logp_difference/max": 1.6366691589355469, "sampling/sampling_logp_difference/mean": 0.01998082734644413, "step": 56, "step_time": 580.471678117523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16223.0, "completions/max_terminated_length": 16223.0, "completions/mean_length": 10092.53125, "completions/mean_terminated_length": 10092.53125, "completions/min_length": 3841.0, "completions/min_terminated_length": 3841.0, "entropy": 0.27271045185625553, "epoch": 0.4634146341463415, "frac_reward_zero_std": 0.0, "grad_norm": 0.40269845724105835, "learning_rate": 7.76e-07, "loss": 0.0223, "num_tokens": 19975051.0, "reward": 0.7949599027633667, "reward_std": 0.6612140536308289, "rewards/reward_func/mean": 0.7949599027633667, "rewards/reward_func/std": 0.6612139940261841, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9906999468803406, "sampling/importance_sampling_ratio/min": 0.2476702779531479, "sampling/sampling_logp_difference/max": 2.0463411808013916, "sampling/sampling_logp_difference/mean": 0.019406329840421677, "step": 57, "step_time": 399.2233660905622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15678.0, "completions/max_terminated_length": 15678.0, "completions/mean_length": 10561.5, "completions/mean_terminated_length": 10561.5, "completions/min_length": 4751.0, "completions/min_terminated_length": 4751.0, "entropy": 0.2965373918414116, "epoch": 0.4715447154471545, "frac_reward_zero_std": 0.0, "grad_norm": 0.3839688003063202, "learning_rate": 7.72e-07, "loss": 0.0454, "num_tokens": 20327715.0, "reward": 0.7230905294418335, "reward_std": 0.6203462481498718, "rewards/reward_func/mean": 0.7230905294418335, "rewards/reward_func/std": 0.6203462481498718, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.706266164779663, "sampling/importance_sampling_ratio/mean": 0.9895825982093811, "sampling/importance_sampling_ratio/min": 0.18361897766590118, "sampling/sampling_logp_difference/max": 1.694892406463623, "sampling/sampling_logp_difference/mean": 0.020958570763468742, "step": 58, "step_time": 389.2628999436274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17072.0, "completions/max_terminated_length": 17072.0, "completions/mean_length": 9915.34375, "completions/mean_terminated_length": 9915.34375, "completions/min_length": 4302.0, "completions/min_terminated_length": 4302.0, "entropy": 0.3017471991479397, "epoch": 0.4796747967479675, "frac_reward_zero_std": 0.0, "grad_norm": 0.4128898084163666, "learning_rate": 7.68e-07, "loss": 0.0444, "num_tokens": 20657838.0, "reward": 1.6393874883651733, "reward_std": 1.3633579015731812, "rewards/reward_func/mean": 1.6446365118026733, "rewards/reward_func/std": 1.3566806316375732, "rewards/soft_overlong_punishment_reward/mean": -0.0052490234375, "rewards/soft_overlong_punishment_reward/std": 0.029692960903048515, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9894176721572876, "sampling/importance_sampling_ratio/min": 0.06410310417413712, "sampling/sampling_logp_difference/max": 2.747262477874756, "sampling/sampling_logp_difference/mean": 0.021471522748470306, "step": 59, "step_time": 475.307358373655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19170.0, "completions/max_terminated_length": 19170.0, "completions/mean_length": 9260.875, "completions/mean_terminated_length": 9260.875, "completions/min_length": 1932.0, "completions/min_terminated_length": 1932.0, "entropy": 0.28078791592270136, "epoch": 0.4878048780487805, "frac_reward_zero_std": 0.0, "grad_norm": 0.4470166265964508, "learning_rate": 7.64e-07, "loss": 0.0672, "num_tokens": 20979498.0, "reward": 0.8571816682815552, "reward_std": 0.9841914176940918, "rewards/reward_func/mean": 0.8915902376174927, "rewards/reward_func/std": 0.9419331550598145, "rewards/soft_overlong_punishment_reward/mean": -0.0344085693359375, "rewards/soft_overlong_punishment_reward/std": 0.13934272527694702, "sampling/importance_sampling_ratio/max": 2.326169729232788, "sampling/importance_sampling_ratio/mean": 0.9909319877624512, "sampling/importance_sampling_ratio/min": 0.1588689088821411, "sampling/sampling_logp_difference/max": 1.8396759033203125, "sampling/sampling_logp_difference/mean": 0.01866196282207966, "step": 60, "step_time": 525.5024625072256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17185.0, "completions/max_terminated_length": 17185.0, "completions/mean_length": 10522.3125, "completions/mean_terminated_length": 10522.3125, "completions/min_length": 5064.0, "completions/min_terminated_length": 5064.0, "entropy": 0.29093262180685997, "epoch": 0.4959349593495935, "frac_reward_zero_std": 0.0, "grad_norm": 0.4672902822494507, "learning_rate": 7.599999999999999e-07, "loss": -0.0478, "num_tokens": 21332124.0, "reward": 0.6969435214996338, "reward_std": 0.6751585602760315, "rewards/reward_func/mean": 0.703054666519165, "rewards/reward_func/std": 0.6807836890220642, "rewards/soft_overlong_punishment_reward/mean": -0.00611114501953125, "rewards/soft_overlong_punishment_reward/std": 0.03456985577940941, "sampling/importance_sampling_ratio/max": 2.0167672634124756, "sampling/importance_sampling_ratio/mean": 0.9900118112564087, "sampling/importance_sampling_ratio/min": 0.29739901423454285, "sampling/sampling_logp_difference/max": 1.2126805782318115, "sampling/sampling_logp_difference/mean": 0.020228687673807144, "step": 61, "step_time": 385.8926363585051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18314.0, "completions/max_terminated_length": 18314.0, "completions/mean_length": 10564.25, "completions/mean_terminated_length": 10564.25, "completions/min_length": 7173.0, "completions/min_terminated_length": 7173.0, "entropy": 0.2720603086054325, "epoch": 0.5040650406504065, "frac_reward_zero_std": 0.0, "grad_norm": 0.443337082862854, "learning_rate": 7.559999999999999e-07, "loss": -0.0205, "num_tokens": 21691372.0, "reward": 1.099975347518921, "reward_std": 0.7780046463012695, "rewards/reward_func/mean": 1.1196210384368896, "rewards/reward_func/std": 0.7830961346626282, "rewards/soft_overlong_punishment_reward/mean": -0.01964569091796875, "rewards/soft_overlong_punishment_reward/std": 0.08696826547384262, "sampling/importance_sampling_ratio/max": 2.837948799133301, "sampling/importance_sampling_ratio/mean": 0.990537166595459, "sampling/importance_sampling_ratio/min": 0.06719715893268585, "sampling/sampling_logp_difference/max": 2.7001242637634277, "sampling/sampling_logp_difference/mean": 0.019431207329034805, "step": 62, "step_time": 443.5746869649738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14595.0, "completions/max_terminated_length": 14595.0, "completions/mean_length": 10805.375, "completions/mean_terminated_length": 10805.375, "completions/min_length": 5937.0, "completions/min_terminated_length": 5937.0, "entropy": 0.27174041885882616, "epoch": 0.5121951219512195, "frac_reward_zero_std": 0.0, "grad_norm": 0.5056492686271667, "learning_rate": 7.52e-07, "loss": 0.009, "num_tokens": 22063440.0, "reward": 1.1353144645690918, "reward_std": 0.8601508736610413, "rewards/reward_func/mean": 1.1353144645690918, "rewards/reward_func/std": 0.8601508736610413, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1125802993774414, "sampling/importance_sampling_ratio/mean": 0.9905575513839722, "sampling/importance_sampling_ratio/min": 0.0975913405418396, "sampling/sampling_logp_difference/max": 2.3269665241241455, "sampling/sampling_logp_difference/mean": 0.019539430737495422, "step": 63, "step_time": 380.66839726571925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14770.0, "completions/max_terminated_length": 14770.0, "completions/mean_length": 9684.9375, "completions/mean_terminated_length": 9684.9375, "completions/min_length": 4925.0, "completions/min_terminated_length": 4925.0, "entropy": 0.27461021207273006, "epoch": 0.5203252032520326, "frac_reward_zero_std": 0.0, "grad_norm": 0.37078821659088135, "learning_rate": 7.48e-07, "loss": 0.0588, "num_tokens": 22385238.0, "reward": 1.3261408805847168, "reward_std": 0.6705518364906311, "rewards/reward_func/mean": 1.3261408805847168, "rewards/reward_func/std": 0.6705518364906311, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.990297794342041, "sampling/importance_sampling_ratio/min": 0.06889015436172485, "sampling/sampling_logp_difference/max": 2.6752419471740723, "sampling/sampling_logp_difference/mean": 0.0197505634278059, "step": 64, "step_time": 334.4651955710724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12913.0, "completions/max_terminated_length": 12913.0, "completions/mean_length": 9230.75, "completions/mean_terminated_length": 9230.75, "completions/min_length": 4211.0, "completions/min_terminated_length": 4211.0, "entropy": 0.29658956080675125, "epoch": 0.5284552845528455, "frac_reward_zero_std": 0.0, "grad_norm": 0.4264526069164276, "learning_rate": 7.44e-07, "loss": -0.0405, "num_tokens": 22691030.0, "reward": 1.2184321880340576, "reward_std": 1.2537956237792969, "rewards/reward_func/mean": 1.2184321880340576, "rewards/reward_func/std": 1.2537956237792969, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5008950233459473, "sampling/importance_sampling_ratio/mean": 0.9899071455001831, "sampling/importance_sampling_ratio/min": 0.25040245056152344, "sampling/sampling_logp_difference/max": 1.3846858739852905, "sampling/sampling_logp_difference/mean": 0.021074209362268448, "step": 65, "step_time": 428.82489290018566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14838.0, "completions/max_terminated_length": 14838.0, "completions/mean_length": 10752.8125, "completions/mean_terminated_length": 10752.8125, "completions/min_length": 6496.0, "completions/min_terminated_length": 6496.0, "entropy": 0.24973559752106667, "epoch": 0.5365853658536586, "frac_reward_zero_std": 0.0, "grad_norm": 0.375764399766922, "learning_rate": 7.4e-07, "loss": -0.0081, "num_tokens": 23060872.0, "reward": 0.6262708306312561, "reward_std": 0.6390258073806763, "rewards/reward_func/mean": 0.6262708306312561, "rewards/reward_func/std": 0.6390258073806763, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5996079444885254, "sampling/importance_sampling_ratio/mean": 0.9914066791534424, "sampling/importance_sampling_ratio/min": 0.24700190126895905, "sampling/sampling_logp_difference/max": 1.3983592987060547, "sampling/sampling_logp_difference/mean": 0.018258236348628998, "step": 66, "step_time": 422.3102921405807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19187.0, "completions/max_terminated_length": 19187.0, "completions/mean_length": 10835.5625, "completions/mean_terminated_length": 10835.5625, "completions/min_length": 6083.0, "completions/min_terminated_length": 6083.0, "entropy": 0.2816160637885332, "epoch": 0.5447154471544715, "frac_reward_zero_std": 0.0, "grad_norm": 0.40830785036087036, "learning_rate": 7.359999999999999e-07, "loss": 0.0398, "num_tokens": 23421858.0, "reward": 0.5286274552345276, "reward_std": 0.5896045565605164, "rewards/reward_func/mean": 0.5500126481056213, "rewards/reward_func/std": 0.5576050877571106, "rewards/soft_overlong_punishment_reward/mean": -0.02138519287109375, "rewards/soft_overlong_punishment_reward/std": 0.12097291648387909, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9901546239852905, "sampling/importance_sampling_ratio/min": 0.0072767846286296844, "sampling/sampling_logp_difference/max": 4.923066139221191, "sampling/sampling_logp_difference/mean": 0.020440150052309036, "step": 67, "step_time": 582.3627753325272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14479.0, "completions/max_terminated_length": 14479.0, "completions/mean_length": 9882.84375, "completions/mean_terminated_length": 9882.84375, "completions/min_length": 5433.0, "completions/min_terminated_length": 5433.0, "entropy": 0.3057326711714268, "epoch": 0.5528455284552846, "frac_reward_zero_std": 0.0, "grad_norm": 0.42447325587272644, "learning_rate": 7.319999999999999e-07, "loss": 0.0463, "num_tokens": 23753165.0, "reward": 1.0461750030517578, "reward_std": 1.0070804357528687, "rewards/reward_func/mean": 1.0461750030517578, "rewards/reward_func/std": 1.007080316543579, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.217542886734009, "sampling/importance_sampling_ratio/mean": 0.989412248134613, "sampling/importance_sampling_ratio/min": 3.028254241144168e-06, "sampling/sampling_logp_difference/max": 12.707524299621582, "sampling/sampling_logp_difference/mean": 0.021606117486953735, "step": 68, "step_time": 338.7651969024446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16279.0, "completions/max_terminated_length": 16279.0, "completions/mean_length": 9928.25, "completions/mean_terminated_length": 9928.25, "completions/min_length": 5870.0, "completions/min_terminated_length": 5870.0, "entropy": 0.2900556083768606, "epoch": 0.5609756097560976, "frac_reward_zero_std": 0.0, "grad_norm": 0.5427579879760742, "learning_rate": 7.28e-07, "loss": 0.0345, "num_tokens": 24090621.0, "reward": 1.0686248540878296, "reward_std": 0.8566891551017761, "rewards/reward_func/mean": 1.0686248540878296, "rewards/reward_func/std": 0.8566891551017761, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9899448156356812, "sampling/importance_sampling_ratio/min": 1.2153052011854015e-05, "sampling/sampling_logp_difference/max": 11.317930221557617, "sampling/sampling_logp_difference/mean": 0.020517408847808838, "step": 69, "step_time": 445.5511495040264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14088.0, "completions/max_terminated_length": 14088.0, "completions/mean_length": 8179.03125, "completions/mean_terminated_length": 8179.03125, "completions/min_length": 3039.0, "completions/min_terminated_length": 3039.0, "entropy": 0.2969997953623533, "epoch": 0.5691056910569106, "frac_reward_zero_std": 0.0, "grad_norm": 0.4507758319377899, "learning_rate": 7.24e-07, "loss": 0.0584, "num_tokens": 24369758.0, "reward": 1.1863713264465332, "reward_std": 0.8616743087768555, "rewards/reward_func/mean": 1.1863713264465332, "rewards/reward_func/std": 0.8616743087768555, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5807595252990723, "sampling/importance_sampling_ratio/mean": 0.989867091178894, "sampling/importance_sampling_ratio/min": 0.2350781410932541, "sampling/sampling_logp_difference/max": 1.4478373527526855, "sampling/sampling_logp_difference/mean": 0.020613517612218857, "step": 70, "step_time": 346.7764633935876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13559.0, "completions/max_terminated_length": 13559.0, "completions/mean_length": 10056.625, "completions/mean_terminated_length": 10056.625, "completions/min_length": 6305.0, "completions/min_terminated_length": 6305.0, "entropy": 0.2842098120599985, "epoch": 0.5772357723577236, "frac_reward_zero_std": 0.0, "grad_norm": 0.39119473099708557, "learning_rate": 7.2e-07, "loss": 0.0067, "num_tokens": 24707362.0, "reward": 1.3005447387695312, "reward_std": 1.250781536102295, "rewards/reward_func/mean": 1.3005447387695312, "rewards/reward_func/std": 1.250781536102295, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6534061431884766, "sampling/importance_sampling_ratio/mean": 0.9900968670845032, "sampling/importance_sampling_ratio/min": 0.25348275899887085, "sampling/sampling_logp_difference/max": 1.3724594116210938, "sampling/sampling_logp_difference/mean": 0.02001977153122425, "step": 71, "step_time": 538.6896389278118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15581.0, "completions/max_terminated_length": 15581.0, "completions/mean_length": 10180.1875, "completions/mean_terminated_length": 10180.1875, "completions/min_length": 5827.0, "completions/min_terminated_length": 5827.0, "entropy": 0.26517100259661674, "epoch": 0.5853658536585366, "frac_reward_zero_std": 0.0, "grad_norm": 0.38772645592689514, "learning_rate": 7.159999999999999e-07, "loss": 0.0407, "num_tokens": 25052880.0, "reward": 1.7468311786651611, "reward_std": 4.23643684387207, "rewards/reward_func/mean": 1.7468311786651611, "rewards/reward_func/std": 4.23643684387207, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2769153118133545, "sampling/importance_sampling_ratio/mean": 0.9908599257469177, "sampling/importance_sampling_ratio/min": 0.07548689097166061, "sampling/sampling_logp_difference/max": 2.583796262741089, "sampling/sampling_logp_difference/mean": 0.018888521939516068, "step": 72, "step_time": 384.7713236459531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14703.0, "completions/max_terminated_length": 14703.0, "completions/mean_length": 10991.09375, "completions/mean_terminated_length": 10991.09375, "completions/min_length": 7374.0, "completions/min_terminated_length": 7374.0, "entropy": 0.2600755328312516, "epoch": 0.5934959349593496, "frac_reward_zero_std": 0.0, "grad_norm": 0.37328991293907166, "learning_rate": 7.119999999999999e-07, "loss": 0.0361, "num_tokens": 25433011.0, "reward": 0.9751912355422974, "reward_std": 1.0739766359329224, "rewards/reward_func/mean": 0.9751912355422974, "rewards/reward_func/std": 1.0739765167236328, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9908562898635864, "sampling/importance_sampling_ratio/min": 0.12725582718849182, "sampling/sampling_logp_difference/max": 2.061555862426758, "sampling/sampling_logp_difference/mean": 0.01900428719818592, "step": 73, "step_time": 383.6669119540602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14219.0, "completions/max_terminated_length": 14219.0, "completions/mean_length": 9966.125, "completions/mean_terminated_length": 9966.125, "completions/min_length": 5639.0, "completions/min_terminated_length": 5639.0, "entropy": 0.27439984772354364, "epoch": 0.6016260162601627, "frac_reward_zero_std": 0.0, "grad_norm": 0.3848857581615448, "learning_rate": 7.079999999999999e-07, "loss": 0.0073, "num_tokens": 25773751.0, "reward": 0.7620717287063599, "reward_std": 0.6133687496185303, "rewards/reward_func/mean": 0.7620717287063599, "rewards/reward_func/std": 0.6133686900138855, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2783548831939697, "sampling/importance_sampling_ratio/mean": 0.9905116558074951, "sampling/importance_sampling_ratio/min": 0.24211688339710236, "sampling/sampling_logp_difference/max": 1.418334722518921, "sampling/sampling_logp_difference/mean": 0.01978294551372528, "step": 74, "step_time": 385.7722886954434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13233.0, "completions/max_terminated_length": 13233.0, "completions/mean_length": 8152.53125, "completions/mean_terminated_length": 8152.53125, "completions/min_length": 5875.0, "completions/min_terminated_length": 5875.0, "entropy": 0.2985440846532583, "epoch": 0.6097560975609756, "frac_reward_zero_std": 0.0, "grad_norm": 0.43377742171287537, "learning_rate": 7.04e-07, "loss": -0.035, "num_tokens": 26044104.0, "reward": 1.812477469444275, "reward_std": 1.1994365453720093, "rewards/reward_func/mean": 1.812477469444275, "rewards/reward_func/std": 1.1994365453720093, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.35860276222229, "sampling/importance_sampling_ratio/mean": 0.9896669387817383, "sampling/importance_sampling_ratio/min": 0.06281934678554535, "sampling/sampling_logp_difference/max": 2.7674922943115234, "sampling/sampling_logp_difference/mean": 0.021110281348228455, "step": 75, "step_time": 291.5811023626011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15224.0, "completions/max_terminated_length": 15224.0, "completions/mean_length": 10790.53125, "completions/mean_terminated_length": 10790.53125, "completions/min_length": 7823.0, "completions/min_terminated_length": 7823.0, "entropy": 0.2837300254032016, "epoch": 0.6178861788617886, "frac_reward_zero_std": 0.0, "grad_norm": 0.38661646842956543, "learning_rate": 7e-07, "loss": -0.0108, "num_tokens": 26413337.0, "reward": 0.9118523597717285, "reward_std": 0.8564381003379822, "rewards/reward_func/mean": 0.9118523597717285, "rewards/reward_func/std": 0.8564381003379822, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.960435628890991, "sampling/importance_sampling_ratio/mean": 0.9901319742202759, "sampling/importance_sampling_ratio/min": 0.23254553973674774, "sampling/sampling_logp_difference/max": 1.4586691856384277, "sampling/sampling_logp_difference/mean": 0.019878923892974854, "step": 76, "step_time": 380.29050638340414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16985.0, "completions/max_terminated_length": 16985.0, "completions/mean_length": 9770.53125, "completions/mean_terminated_length": 9770.53125, "completions/min_length": 4901.0, "completions/min_terminated_length": 4901.0, "entropy": 0.2725865785032511, "epoch": 0.6260162601626016, "frac_reward_zero_std": 0.0, "grad_norm": 0.41249147057533264, "learning_rate": 6.959999999999999e-07, "loss": 0.0794, "num_tokens": 26749218.0, "reward": 0.9970647096633911, "reward_std": 0.8713412284851074, "rewards/reward_func/mean": 1.0016499757766724, "rewards/reward_func/std": 0.8657678961753845, "rewards/soft_overlong_punishment_reward/mean": -0.00458526611328125, "rewards/soft_overlong_punishment_reward/std": 0.025938183069229126, "sampling/importance_sampling_ratio/max": 2.4935269355773926, "sampling/importance_sampling_ratio/mean": 0.9904952645301819, "sampling/importance_sampling_ratio/min": 1.930824146256782e-05, "sampling/sampling_logp_difference/max": 10.854978561401367, "sampling/sampling_logp_difference/mean": 0.019513940438628197, "step": 77, "step_time": 404.0872638344299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20081.0, "completions/max_terminated_length": 20081.0, "completions/mean_length": 8572.0625, "completions/mean_terminated_length": 8572.0625, "completions/min_length": 4595.0, "completions/min_terminated_length": 4595.0, "entropy": 0.29921186715364456, "epoch": 0.6341463414634146, "frac_reward_zero_std": 0.0, "grad_norm": 0.4499685764312744, "learning_rate": 6.919999999999999e-07, "loss": 0.0132, "num_tokens": 27042652.0, "reward": 0.7779614329338074, "reward_std": 0.7257394790649414, "rewards/reward_func/mean": 0.8061673045158386, "rewards/reward_func/std": 0.6761706471443176, "rewards/soft_overlong_punishment_reward/mean": -0.02820587158203125, "rewards/soft_overlong_punishment_reward/std": 0.15955650806427002, "sampling/importance_sampling_ratio/max": 2.7844414710998535, "sampling/importance_sampling_ratio/mean": 0.9895604848861694, "sampling/importance_sampling_ratio/min": 0.08751080930233002, "sampling/sampling_logp_difference/max": 2.435992956161499, "sampling/sampling_logp_difference/mean": 0.020626315847039223, "step": 78, "step_time": 375.787244503852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16246.0, "completions/max_terminated_length": 16246.0, "completions/mean_length": 9898.75, "completions/mean_terminated_length": 9898.75, "completions/min_length": 3833.0, "completions/min_terminated_length": 3833.0, "entropy": 0.30376508831977844, "epoch": 0.6422764227642277, "frac_reward_zero_std": 0.0, "grad_norm": 0.3933331072330475, "learning_rate": 6.879999999999999e-07, "loss": -0.0387, "num_tokens": 27372492.0, "reward": 0.9470673203468323, "reward_std": 0.6477987766265869, "rewards/reward_func/mean": 0.9470673203468323, "rewards/reward_func/std": 0.6477987170219421, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9050569534301758, "sampling/importance_sampling_ratio/mean": 0.9895777106285095, "sampling/importance_sampling_ratio/min": 0.05304478108882904, "sampling/sampling_logp_difference/max": 2.9366188049316406, "sampling/sampling_logp_difference/mean": 0.02129667066037655, "step": 79, "step_time": 359.03310281271115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12132.0, "completions/max_terminated_length": 12132.0, "completions/mean_length": 8229.09375, "completions/mean_terminated_length": 8229.09375, "completions/min_length": 6227.0, "completions/min_terminated_length": 6227.0, "entropy": 0.30129735358059406, "epoch": 0.6504065040650406, "frac_reward_zero_std": 0.0, "grad_norm": 0.45306941866874695, "learning_rate": 6.84e-07, "loss": 0.0203, "num_tokens": 27650159.0, "reward": 2.131523847579956, "reward_std": 1.2827610969543457, "rewards/reward_func/mean": 2.131523847579956, "rewards/reward_func/std": 1.2827610969543457, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0942087173461914, "sampling/importance_sampling_ratio/mean": 0.989490270614624, "sampling/importance_sampling_ratio/min": 0.02814963459968567, "sampling/sampling_logp_difference/max": 3.570220947265625, "sampling/sampling_logp_difference/mean": 0.02144203893840313, "step": 80, "step_time": 300.18679245212115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15563.0, "completions/max_terminated_length": 15563.0, "completions/mean_length": 10849.8125, "completions/mean_terminated_length": 10849.8125, "completions/min_length": 7417.0, "completions/min_terminated_length": 7417.0, "entropy": 0.2771333325654268, "epoch": 0.6585365853658537, "frac_reward_zero_std": 0.0, "grad_norm": 0.3706999123096466, "learning_rate": 6.800000000000001e-07, "loss": 0.0067, "num_tokens": 28017993.0, "reward": 0.7668748497962952, "reward_std": 0.7185682654380798, "rewards/reward_func/mean": 0.7668748497962952, "rewards/reward_func/std": 0.7185682654380798, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9884979724884033, "sampling/importance_sampling_ratio/mean": 0.9902896881103516, "sampling/importance_sampling_ratio/min": 0.007819842547178268, "sampling/sampling_logp_difference/max": 4.851090908050537, "sampling/sampling_logp_difference/mean": 0.01989693194627762, "step": 81, "step_time": 368.69332744996063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14354.0, "completions/max_terminated_length": 14354.0, "completions/mean_length": 10310.65625, "completions/mean_terminated_length": 10310.65625, "completions/min_length": 7593.0, "completions/min_terminated_length": 7593.0, "entropy": 0.2430378319695592, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.36904025077819824, "learning_rate": 6.76e-07, "loss": -0.0143, "num_tokens": 28370734.0, "reward": 0.7325789928436279, "reward_std": 0.7706360816955566, "rewards/reward_func/mean": 0.7325789928436279, "rewards/reward_func/std": 0.7706360816955566, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3821725845336914, "sampling/importance_sampling_ratio/mean": 0.9915218949317932, "sampling/importance_sampling_ratio/min": 0.01873675547540188, "sampling/sampling_logp_difference/max": 3.9772682189941406, "sampling/sampling_logp_difference/mean": 0.017709840089082718, "step": 82, "step_time": 554.2226089162286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14717.0, "completions/max_terminated_length": 14717.0, "completions/mean_length": 10310.9375, "completions/mean_terminated_length": 10310.9375, "completions/min_length": 6474.0, "completions/min_terminated_length": 6474.0, "entropy": 0.2876297067850828, "epoch": 0.6747967479674797, "frac_reward_zero_std": 0.0, "grad_norm": 0.38776862621307373, "learning_rate": 6.72e-07, "loss": 0.0177, "num_tokens": 28721708.0, "reward": 1.4957520961761475, "reward_std": 1.590112566947937, "rewards/reward_func/mean": 1.4957520961761475, "rewards/reward_func/std": 1.5901126861572266, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.842559576034546, "sampling/importance_sampling_ratio/mean": 0.9902292490005493, "sampling/importance_sampling_ratio/min": 6.962251973163802e-06, "sampling/sampling_logp_difference/max": 11.875007629394531, "sampling/sampling_logp_difference/mean": 0.020037703216075897, "step": 83, "step_time": 359.8615084751509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14412.0, "completions/max_terminated_length": 14412.0, "completions/mean_length": 9628.71875, "completions/mean_terminated_length": 9628.71875, "completions/min_length": 5146.0, "completions/min_terminated_length": 5146.0, "entropy": 0.2873276565223932, "epoch": 0.6829268292682927, "frac_reward_zero_std": 0.0, "grad_norm": 0.40790072083473206, "learning_rate": 6.68e-07, "loss": 0.0, "num_tokens": 29047491.0, "reward": 1.1052613258361816, "reward_std": 0.8609189987182617, "rewards/reward_func/mean": 1.1052613258361816, "rewards/reward_func/std": 0.8609189987182617, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2451746463775635, "sampling/importance_sampling_ratio/mean": 0.9900080561637878, "sampling/importance_sampling_ratio/min": 0.3554363250732422, "sampling/sampling_logp_difference/max": 1.0344091653823853, "sampling/sampling_logp_difference/mean": 0.020314738154411316, "step": 84, "step_time": 354.82466936972924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12796.0, "completions/max_terminated_length": 12796.0, "completions/mean_length": 9634.09375, "completions/mean_terminated_length": 9634.09375, "completions/min_length": 5843.0, "completions/min_terminated_length": 5843.0, "entropy": 0.29404681362211704, "epoch": 0.6910569105691057, "frac_reward_zero_std": 0.25, "grad_norm": 0.4186258018016815, "learning_rate": 6.64e-07, "loss": -0.0343, "num_tokens": 29372486.0, "reward": 0.45844361186027527, "reward_std": 0.5968901515007019, "rewards/reward_func/mean": 0.45844361186027527, "rewards/reward_func/std": 0.5968901515007019, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8098247051239014, "sampling/importance_sampling_ratio/mean": 0.9896739721298218, "sampling/importance_sampling_ratio/min": 2.6880831782705172e-08, "sampling/sampling_logp_difference/max": 17.431852340698242, "sampling/sampling_logp_difference/mean": 0.020807670429348946, "step": 85, "step_time": 520.7830489559565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14586.0, "completions/max_terminated_length": 14586.0, "completions/mean_length": 10420.28125, "completions/mean_terminated_length": 10420.28125, "completions/min_length": 6195.0, "completions/min_terminated_length": 6195.0, "entropy": 0.27361532766371965, "epoch": 0.6991869918699187, "frac_reward_zero_std": 0.0, "grad_norm": 0.4719882607460022, "learning_rate": 6.6e-07, "loss": 0.0101, "num_tokens": 29725231.0, "reward": 0.7226468920707703, "reward_std": 0.8586759567260742, "rewards/reward_func/mean": 0.7226468920707703, "rewards/reward_func/std": 0.8586759567260742, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9905459880828857, "sampling/importance_sampling_ratio/min": 0.004422293510288, "sampling/sampling_logp_difference/max": 5.4210968017578125, "sampling/sampling_logp_difference/mean": 0.019551947712898254, "step": 86, "step_time": 431.56240568542853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15442.0, "completions/max_terminated_length": 15442.0, "completions/mean_length": 9467.71875, "completions/mean_terminated_length": 9467.71875, "completions/min_length": 4182.0, "completions/min_terminated_length": 4182.0, "entropy": 0.2771501960232854, "epoch": 0.7073170731707317, "frac_reward_zero_std": 0.25, "grad_norm": 0.373221218585968, "learning_rate": 6.56e-07, "loss": 0.0111, "num_tokens": 30044670.0, "reward": 0.8195620179176331, "reward_std": 0.6958929300308228, "rewards/reward_func/mean": 0.8195620179176331, "rewards/reward_func/std": 0.6958929300308228, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9902039766311646, "sampling/importance_sampling_ratio/min": 0.2062566578388214, "sampling/sampling_logp_difference/max": 1.5786339044570923, "sampling/sampling_logp_difference/mean": 0.020228460431098938, "step": 87, "step_time": 379.2372320953291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13434.0, "completions/max_terminated_length": 13434.0, "completions/mean_length": 7646.875, "completions/mean_terminated_length": 7646.875, "completions/min_length": 2072.0, "completions/min_terminated_length": 2072.0, "entropy": 0.29658396914601326, "epoch": 0.7154471544715447, "frac_reward_zero_std": 0.0, "grad_norm": 0.4571278393268585, "learning_rate": 6.52e-07, "loss": -0.015, "num_tokens": 30299066.0, "reward": 1.1921298503875732, "reward_std": 0.7886288166046143, "rewards/reward_func/mean": 1.1921298503875732, "rewards/reward_func/std": 0.7886288166046143, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9895877838134766, "sampling/importance_sampling_ratio/min": 0.07900335639715195, "sampling/sampling_logp_difference/max": 2.5382649898529053, "sampling/sampling_logp_difference/mean": 0.021227367222309113, "step": 88, "step_time": 305.5276520336047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17540.0, "completions/max_terminated_length": 17540.0, "completions/mean_length": 11489.90625, "completions/mean_terminated_length": 11489.90625, "completions/min_length": 5587.0, "completions/min_terminated_length": 5587.0, "entropy": 0.2844330407679081, "epoch": 0.7235772357723578, "frac_reward_zero_std": 0.0, "grad_norm": 0.3779417872428894, "learning_rate": 6.48e-07, "loss": 0.0617, "num_tokens": 30689791.0, "reward": 0.8213021755218506, "reward_std": 0.6814008355140686, "rewards/reward_func/mean": 0.8301217555999756, "rewards/reward_func/std": 0.6683583855628967, "rewards/soft_overlong_punishment_reward/mean": -0.008819580078125, "rewards/soft_overlong_punishment_reward/std": 0.04989107698202133, "sampling/importance_sampling_ratio/max": 2.003089189529419, "sampling/importance_sampling_ratio/mean": 0.9900344014167786, "sampling/importance_sampling_ratio/min": 0.00883798860013485, "sampling/sampling_logp_difference/max": 4.728695869445801, "sampling/sampling_logp_difference/mean": 0.020483214408159256, "step": 89, "step_time": 492.1519634043798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15601.0, "completions/max_terminated_length": 15601.0, "completions/mean_length": 9316.0, "completions/mean_terminated_length": 9316.0, "completions/min_length": 4857.0, "completions/min_terminated_length": 4857.0, "entropy": 0.28548908419907093, "epoch": 0.7317073170731707, "frac_reward_zero_std": 0.0, "grad_norm": 0.40802574157714844, "learning_rate": 6.44e-07, "loss": 0.0538, "num_tokens": 31001015.0, "reward": 0.9615650177001953, "reward_std": 0.8863016366958618, "rewards/reward_func/mean": 0.9615650177001953, "rewards/reward_func/std": 0.8863016366958618, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.969780683517456, "sampling/importance_sampling_ratio/mean": 0.990296483039856, "sampling/importance_sampling_ratio/min": 0.0010702904546633363, "sampling/sampling_logp_difference/max": 6.83982515335083, "sampling/sampling_logp_difference/mean": 0.020029699429869652, "step": 90, "step_time": 341.7149986529257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15982.0, "completions/max_terminated_length": 15982.0, "completions/mean_length": 10284.1875, "completions/mean_terminated_length": 10284.1875, "completions/min_length": 6201.0, "completions/min_terminated_length": 6201.0, "entropy": 0.2762817107141018, "epoch": 0.7398373983739838, "frac_reward_zero_std": 0.0, "grad_norm": 0.3966502249240875, "learning_rate": 6.4e-07, "loss": -0.0541, "num_tokens": 31353965.0, "reward": 0.6544884443283081, "reward_std": 0.5549855828285217, "rewards/reward_func/mean": 0.6544884443283081, "rewards/reward_func/std": 0.5549855828285217, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.07761287689209, "sampling/importance_sampling_ratio/mean": 0.9903784990310669, "sampling/importance_sampling_ratio/min": 0.028766363859176636, "sampling/sampling_logp_difference/max": 3.548548460006714, "sampling/sampling_logp_difference/mean": 0.019771484658122063, "step": 91, "step_time": 365.1295900817495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12146.0, "completions/max_terminated_length": 12146.0, "completions/mean_length": 8731.96875, "completions/mean_terminated_length": 8731.96875, "completions/min_length": 6007.0, "completions/min_terminated_length": 6007.0, "entropy": 0.27454613894224167, "epoch": 0.7479674796747967, "frac_reward_zero_std": 0.0, "grad_norm": 0.4091607332229614, "learning_rate": 6.36e-07, "loss": 0.0174, "num_tokens": 31652684.0, "reward": 1.158060073852539, "reward_std": 1.1567602157592773, "rewards/reward_func/mean": 1.158060073852539, "rewards/reward_func/std": 1.1567600965499878, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.083181858062744, "sampling/importance_sampling_ratio/mean": 0.9904876947402954, "sampling/importance_sampling_ratio/min": 0.17917752265930176, "sampling/sampling_logp_difference/max": 1.7193782329559326, "sampling/sampling_logp_difference/mean": 0.019298098981380463, "step": 92, "step_time": 288.10143864457496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14462.0, "completions/max_terminated_length": 14462.0, "completions/mean_length": 10936.0625, "completions/mean_terminated_length": 10936.0625, "completions/min_length": 7082.0, "completions/min_terminated_length": 7082.0, "entropy": 0.25290792621672153, "epoch": 0.7560975609756098, "frac_reward_zero_std": 0.0, "grad_norm": 0.3430319130420685, "learning_rate": 6.319999999999999e-07, "loss": 0.0229, "num_tokens": 32026262.0, "reward": 0.730975866317749, "reward_std": 1.1369214057922363, "rewards/reward_func/mean": 0.730975866317749, "rewards/reward_func/std": 1.1369214057922363, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5900559425354004, "sampling/importance_sampling_ratio/mean": 0.9913033843040466, "sampling/importance_sampling_ratio/min": 0.24479100108146667, "sampling/sampling_logp_difference/max": 1.4073505401611328, "sampling/sampling_logp_difference/mean": 0.018287766724824905, "step": 93, "step_time": 343.9439448376652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12133.0, "completions/max_terminated_length": 12133.0, "completions/mean_length": 9534.03125, "completions/mean_terminated_length": 9534.03125, "completions/min_length": 6151.0, "completions/min_terminated_length": 6151.0, "entropy": 0.2943458501249552, "epoch": 0.7642276422764228, "frac_reward_zero_std": 0.0, "grad_norm": 0.45476409792900085, "learning_rate": 6.28e-07, "loss": -0.0413, "num_tokens": 32348031.0, "reward": 0.7935033440589905, "reward_std": 0.5916282534599304, "rewards/reward_func/mean": 0.7935033440589905, "rewards/reward_func/std": 0.5916281938552856, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3106529712677, "sampling/importance_sampling_ratio/mean": 0.9898567795753479, "sampling/importance_sampling_ratio/min": 0.35712242126464844, "sampling/sampling_logp_difference/max": 1.0296766757965088, "sampling/sampling_logp_difference/mean": 0.020504558458924294, "step": 94, "step_time": 310.55569249019027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13674.0, "completions/max_terminated_length": 13674.0, "completions/mean_length": 11145.9375, "completions/mean_terminated_length": 11145.9375, "completions/min_length": 7647.0, "completions/min_terminated_length": 7647.0, "entropy": 0.25408192910254, "epoch": 0.7723577235772358, "frac_reward_zero_std": 0.0, "grad_norm": 0.3503531813621521, "learning_rate": 6.24e-07, "loss": -0.0358, "num_tokens": 32743661.0, "reward": 1.0657795667648315, "reward_std": 0.5564767122268677, "rewards/reward_func/mean": 1.0657795667648315, "rewards/reward_func/std": 0.5564767122268677, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9909650087356567, "sampling/importance_sampling_ratio/min": 0.024863438680768013, "sampling/sampling_logp_difference/max": 3.694356918334961, "sampling/sampling_logp_difference/mean": 0.018509428948163986, "step": 95, "step_time": 504.7223396820482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12207.0, "completions/max_terminated_length": 12207.0, "completions/mean_length": 7209.34375, "completions/mean_terminated_length": 7209.34375, "completions/min_length": 3655.0, "completions/min_terminated_length": 3655.0, "entropy": 0.2981985919177532, "epoch": 0.7804878048780488, "frac_reward_zero_std": 0.25, "grad_norm": 0.4046306014060974, "learning_rate": 6.2e-07, "loss": 0.0208, "num_tokens": 32986864.0, "reward": 1.0635581016540527, "reward_std": 0.6817582249641418, "rewards/reward_func/mean": 1.0635581016540527, "rewards/reward_func/std": 0.6817582845687866, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9894931316375732, "sampling/importance_sampling_ratio/min": 0.20488663017749786, "sampling/sampling_logp_difference/max": 1.5852985382080078, "sampling/sampling_logp_difference/mean": 0.020824309438467026, "step": 96, "step_time": 286.0661023929715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14730.0, "completions/max_terminated_length": 14730.0, "completions/mean_length": 10380.71875, "completions/mean_terminated_length": 10380.71875, "completions/min_length": 5851.0, "completions/min_terminated_length": 5851.0, "entropy": 0.27193080354481936, "epoch": 0.7886178861788617, "frac_reward_zero_std": 0.25, "grad_norm": 0.32370734214782715, "learning_rate": 6.16e-07, "loss": 0.0166, "num_tokens": 33339135.0, "reward": 0.4628852605819702, "reward_std": 0.697974443435669, "rewards/reward_func/mean": 0.4628852605819702, "rewards/reward_func/std": 0.6979743838310242, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9995118379592896, "sampling/importance_sampling_ratio/mean": 0.990587592124939, "sampling/importance_sampling_ratio/min": 0.3680788576602936, "sampling/sampling_logp_difference/max": 0.9994580745697021, "sampling/sampling_logp_difference/mean": 0.019620057195425034, "step": 97, "step_time": 386.9268953008577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17358.0, "completions/max_terminated_length": 17358.0, "completions/mean_length": 10972.78125, "completions/mean_terminated_length": 10972.78125, "completions/min_length": 6301.0, "completions/min_terminated_length": 6301.0, "entropy": 0.2722749076783657, "epoch": 0.7967479674796748, "frac_reward_zero_std": 0.0, "grad_norm": 0.38124802708625793, "learning_rate": 6.119999999999999e-07, "loss": -0.0324, "num_tokens": 33703832.0, "reward": 3.1216416358947754, "reward_std": 4.944037437438965, "rewards/reward_func/mean": 3.129072666168213, "rewards/reward_func/std": 4.940955638885498, "rewards/soft_overlong_punishment_reward/mean": -0.0074310302734375, "rewards/soft_overlong_punishment_reward/std": 0.042036253958940506, "sampling/importance_sampling_ratio/max": 2.931342124938965, "sampling/importance_sampling_ratio/mean": 0.9906518459320068, "sampling/importance_sampling_ratio/min": 0.14344006776809692, "sampling/sampling_logp_difference/max": 1.941838026046753, "sampling/sampling_logp_difference/mean": 0.01964397355914116, "step": 98, "step_time": 387.24955694633536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17372.0, "completions/max_terminated_length": 17372.0, "completions/mean_length": 11532.4375, "completions/mean_terminated_length": 11532.4375, "completions/min_length": 8395.0, "completions/min_terminated_length": 8395.0, "entropy": 0.2691387142986059, "epoch": 0.8048780487804879, "frac_reward_zero_std": 0.0, "grad_norm": 0.3799015283584595, "learning_rate": 6.079999999999999e-07, "loss": 0.0257, "num_tokens": 34090550.0, "reward": 0.8006877899169922, "reward_std": 0.6781271696090698, "rewards/reward_func/mean": 0.8082256317138672, "rewards/reward_func/std": 0.6674283742904663, "rewards/soft_overlong_punishment_reward/mean": -0.007537841796875, "rewards/soft_overlong_punishment_reward/std": 0.04264046996831894, "sampling/importance_sampling_ratio/max": 2.300957202911377, "sampling/importance_sampling_ratio/mean": 0.9906717538833618, "sampling/importance_sampling_ratio/min": 0.20086948573589325, "sampling/sampling_logp_difference/max": 1.6050999164581299, "sampling/sampling_logp_difference/mean": 0.019317321479320526, "step": 99, "step_time": 392.5223165056668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12911.0, "completions/max_terminated_length": 12911.0, "completions/mean_length": 9746.75, "completions/mean_terminated_length": 9746.75, "completions/min_length": 6072.0, "completions/min_terminated_length": 6072.0, "entropy": 0.2797823455184698, "epoch": 0.8130081300813008, "frac_reward_zero_std": 0.0, "grad_norm": 0.40460583567619324, "learning_rate": 6.04e-07, "loss": 0.029, "num_tokens": 34424134.0, "reward": 0.8509798049926758, "reward_std": 0.6565719842910767, "rewards/reward_func/mean": 0.8509798049926758, "rewards/reward_func/std": 0.6565720438957214, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1567296981811523, "sampling/importance_sampling_ratio/mean": 0.9903455376625061, "sampling/importance_sampling_ratio/min": 0.04983352869749069, "sampling/sampling_logp_difference/max": 2.9990673065185547, "sampling/sampling_logp_difference/mean": 0.020120201632380486, "step": 100, "step_time": 411.4386176103726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14230.0, "completions/max_terminated_length": 14230.0, "completions/mean_length": 9372.625, "completions/mean_terminated_length": 9372.625, "completions/min_length": 6478.0, "completions/min_terminated_length": 6478.0, "entropy": 0.2752824854105711, "epoch": 0.8211382113821138, "frac_reward_zero_std": 0.0, "grad_norm": 0.42639046907424927, "learning_rate": 6e-07, "loss": 0.0453, "num_tokens": 34742706.0, "reward": 0.8862287998199463, "reward_std": 0.5651472806930542, "rewards/reward_func/mean": 0.8862287998199463, "rewards/reward_func/std": 0.5651472806930542, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2520346641540527, "sampling/importance_sampling_ratio/mean": 0.9904347062110901, "sampling/importance_sampling_ratio/min": 0.2951987087726593, "sampling/sampling_logp_difference/max": 1.220106601715088, "sampling/sampling_logp_difference/mean": 0.019315559417009354, "step": 101, "step_time": 487.11677971179597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14193.0, "completions/max_terminated_length": 14193.0, "completions/mean_length": 9737.625, "completions/mean_terminated_length": 9737.625, "completions/min_length": 6748.0, "completions/min_terminated_length": 6748.0, "entropy": 0.26770962681621313, "epoch": 0.8292682926829268, "frac_reward_zero_std": 0.0, "grad_norm": 0.40013840794563293, "learning_rate": 5.96e-07, "loss": 0.0546, "num_tokens": 35072926.0, "reward": 0.8616600036621094, "reward_std": 0.6439077854156494, "rewards/reward_func/mean": 0.8616600036621094, "rewards/reward_func/std": 0.6439077258110046, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9906123876571655, "sampling/importance_sampling_ratio/min": 0.08764204382896423, "sampling/sampling_logp_difference/max": 2.4344944953918457, "sampling/sampling_logp_difference/mean": 0.019352490082383156, "step": 102, "step_time": 527.143631025916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13535.0, "completions/max_terminated_length": 13535.0, "completions/mean_length": 7765.5, "completions/mean_terminated_length": 7765.5, "completions/min_length": 2947.0, "completions/min_terminated_length": 2947.0, "entropy": 0.2951889578253031, "epoch": 0.8373983739837398, "frac_reward_zero_std": 0.0, "grad_norm": 0.438553124666214, "learning_rate": 5.919999999999999e-07, "loss": 0.0248, "num_tokens": 35336830.0, "reward": 1.7072114944458008, "reward_std": 1.5981085300445557, "rewards/reward_func/mean": 1.7072114944458008, "rewards/reward_func/std": 1.5981085300445557, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.115924119949341, "sampling/importance_sampling_ratio/mean": 0.9898935556411743, "sampling/importance_sampling_ratio/min": 7.95746746007353e-06, "sampling/sampling_logp_difference/max": 11.741399765014648, "sampling/sampling_logp_difference/mean": 0.020254164934158325, "step": 103, "step_time": 370.2516266454477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14842.0, "completions/max_terminated_length": 14842.0, "completions/mean_length": 10355.96875, "completions/mean_terminated_length": 10355.96875, "completions/min_length": 5380.0, "completions/min_terminated_length": 5380.0, "entropy": 0.27049592323601246, "epoch": 0.8455284552845529, "frac_reward_zero_std": 0.0, "grad_norm": 0.4156341850757599, "learning_rate": 5.879999999999999e-07, "loss": 0.0127, "num_tokens": 35693509.0, "reward": 0.41826581954956055, "reward_std": 0.4194912612438202, "rewards/reward_func/mean": 0.41826581954956055, "rewards/reward_func/std": 0.4194912910461426, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.567049264907837, "sampling/importance_sampling_ratio/mean": 0.9908696413040161, "sampling/importance_sampling_ratio/min": 0.14091628789901733, "sampling/sampling_logp_difference/max": 1.9595892429351807, "sampling/sampling_logp_difference/mean": 0.01946902647614479, "step": 104, "step_time": 353.2303258762695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13809.0, "completions/max_terminated_length": 13809.0, "completions/mean_length": 9495.78125, "completions/mean_terminated_length": 9495.78125, "completions/min_length": 4356.0, "completions/min_terminated_length": 4356.0, "entropy": 0.2733620647341013, "epoch": 0.8536585365853658, "frac_reward_zero_std": 0.0, "grad_norm": 0.4369182586669922, "learning_rate": 5.839999999999999e-07, "loss": -0.0393, "num_tokens": 36021462.0, "reward": 1.382319450378418, "reward_std": 1.3517597913742065, "rewards/reward_func/mean": 1.382319450378418, "rewards/reward_func/std": 1.351759672164917, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9902534484863281, "sampling/importance_sampling_ratio/min": 0.12755648791790009, "sampling/sampling_logp_difference/max": 2.0591959953308105, "sampling/sampling_logp_difference/mean": 0.019751761108636856, "step": 105, "step_time": 376.45426351577044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16323.0, "completions/max_terminated_length": 16323.0, "completions/mean_length": 11590.21875, "completions/mean_terminated_length": 11590.21875, "completions/min_length": 7957.0, "completions/min_terminated_length": 7957.0, "entropy": 0.26938960794359446, "epoch": 0.8617886178861789, "frac_reward_zero_std": 0.0, "grad_norm": 0.3502255082130432, "learning_rate": 5.8e-07, "loss": -0.0654, "num_tokens": 36413573.0, "reward": 1.1353490352630615, "reward_std": 0.9817598462104797, "rewards/reward_func/mean": 1.1353490352630615, "rewards/reward_func/std": 0.9817598462104797, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.990548849105835, "sampling/importance_sampling_ratio/min": 0.16546453535556793, "sampling/sampling_logp_difference/max": 1.7989983558654785, "sampling/sampling_logp_difference/mean": 0.0195661298930645, "step": 106, "step_time": 459.41654721833766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12627.0, "completions/max_terminated_length": 12627.0, "completions/mean_length": 10500.6875, "completions/mean_terminated_length": 10500.6875, "completions/min_length": 7936.0, "completions/min_terminated_length": 7936.0, "entropy": 0.26135144662112, "epoch": 0.8699186991869918, "frac_reward_zero_std": 0.0, "grad_norm": 0.3937532901763916, "learning_rate": 5.76e-07, "loss": -0.0177, "num_tokens": 36769739.0, "reward": 0.8728915452957153, "reward_std": 0.605887770652771, "rewards/reward_func/mean": 0.8728915452957153, "rewards/reward_func/std": 0.6058877110481262, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5900862216949463, "sampling/importance_sampling_ratio/mean": 0.9908682107925415, "sampling/importance_sampling_ratio/min": 0.16999557614326477, "sampling/sampling_logp_difference/max": 1.7719829082489014, "sampling/sampling_logp_difference/mean": 0.018946364521980286, "step": 107, "step_time": 383.9537097290158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13026.0, "completions/max_terminated_length": 13026.0, "completions/mean_length": 8727.03125, "completions/mean_terminated_length": 8727.03125, "completions/min_length": 3240.0, "completions/min_terminated_length": 3240.0, "entropy": 0.28625404462218285, "epoch": 0.8780487804878049, "frac_reward_zero_std": 0.0, "grad_norm": 0.4003085792064667, "learning_rate": 5.719999999999999e-07, "loss": -0.0317, "num_tokens": 37067684.0, "reward": 0.8368265628814697, "reward_std": 0.700191855430603, "rewards/reward_func/mean": 0.8368265628814697, "rewards/reward_func/std": 0.7001917958259583, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3381805419921875, "sampling/importance_sampling_ratio/mean": 0.990260124206543, "sampling/importance_sampling_ratio/min": 0.3622949719429016, "sampling/sampling_logp_difference/max": 1.0152965784072876, "sampling/sampling_logp_difference/mean": 0.01966564916074276, "step": 108, "step_time": 342.13271795446053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15831.0, "completions/max_terminated_length": 15831.0, "completions/mean_length": 9779.71875, "completions/mean_terminated_length": 9779.71875, "completions/min_length": 4923.0, "completions/min_terminated_length": 4923.0, "entropy": 0.2904251739382744, "epoch": 0.8861788617886179, "frac_reward_zero_std": 0.0, "grad_norm": 0.3797857463359833, "learning_rate": 5.679999999999999e-07, "loss": -0.0324, "num_tokens": 37391699.0, "reward": 0.8686186671257019, "reward_std": 1.0430734157562256, "rewards/reward_func/mean": 0.8686186671257019, "rewards/reward_func/std": 1.0430734157562256, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.62477970123291, "sampling/importance_sampling_ratio/mean": 0.9900540113449097, "sampling/importance_sampling_ratio/min": 0.09236539900302887, "sampling/sampling_logp_difference/max": 2.382002830505371, "sampling/sampling_logp_difference/mean": 0.02043168991804123, "step": 109, "step_time": 490.61023391690105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16567.0, "completions/max_terminated_length": 16567.0, "completions/mean_length": 10196.96875, "completions/mean_terminated_length": 10196.96875, "completions/min_length": 3045.0, "completions/min_terminated_length": 3045.0, "entropy": 0.27857582084834576, "epoch": 0.8943089430894309, "frac_reward_zero_std": 0.0, "grad_norm": 0.6588975191116333, "learning_rate": 5.639999999999999e-07, "loss": 0.0584, "num_tokens": 37735962.0, "reward": 0.9283621907234192, "reward_std": 0.6262274384498596, "rewards/reward_func/mean": 0.9300482869148254, "rewards/reward_func/std": 0.6267218589782715, "rewards/soft_overlong_punishment_reward/mean": -0.00168609619140625, "rewards/soft_overlong_punishment_reward/std": 0.008014494553208351, "sampling/importance_sampling_ratio/max": 2.244232654571533, "sampling/importance_sampling_ratio/mean": 0.9905369281768799, "sampling/importance_sampling_ratio/min": 0.13296444714069366, "sampling/sampling_logp_difference/max": 2.0176734924316406, "sampling/sampling_logp_difference/mean": 0.019462019205093384, "step": 110, "step_time": 402.9467481090687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15759.0, "completions/max_terminated_length": 15759.0, "completions/mean_length": 10804.0625, "completions/mean_terminated_length": 10804.0625, "completions/min_length": 7114.0, "completions/min_terminated_length": 7114.0, "entropy": 0.2655584989115596, "epoch": 0.9024390243902439, "frac_reward_zero_std": 0.0, "grad_norm": 0.3911796808242798, "learning_rate": 5.6e-07, "loss": 0.0368, "num_tokens": 38108716.0, "reward": 0.9208776950836182, "reward_std": 0.6002487540245056, "rewards/reward_func/mean": 0.9208776950836182, "rewards/reward_func/std": 0.6002486944198608, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9908932447433472, "sampling/importance_sampling_ratio/min": 0.11072701215744019, "sampling/sampling_logp_difference/max": 2.2006874084472656, "sampling/sampling_logp_difference/mean": 0.018912389874458313, "step": 111, "step_time": 407.13406765437685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12699.0, "completions/max_terminated_length": 12699.0, "completions/mean_length": 9181.875, "completions/mean_terminated_length": 9181.875, "completions/min_length": 5314.0, "completions/min_terminated_length": 5314.0, "entropy": 0.30285687930881977, "epoch": 0.9105691056910569, "frac_reward_zero_std": 0.0, "grad_norm": 0.3971967101097107, "learning_rate": 5.560000000000001e-07, "loss": 0.0194, "num_tokens": 38414432.0, "reward": 0.7858570218086243, "reward_std": 0.641828179359436, "rewards/reward_func/mean": 0.7858570218086243, "rewards/reward_func/std": 0.641828179359436, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0928192138671875, "sampling/importance_sampling_ratio/mean": 0.989500105381012, "sampling/importance_sampling_ratio/min": 0.3735429346561432, "sampling/sampling_logp_difference/max": 0.984722375869751, "sampling/sampling_logp_difference/mean": 0.021352462470531464, "step": 112, "step_time": 310.6167916948907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12846.0, "completions/max_terminated_length": 12846.0, "completions/mean_length": 9087.71875, "completions/mean_terminated_length": 9087.71875, "completions/min_length": 2983.0, "completions/min_terminated_length": 2983.0, "entropy": 0.2819946352392435, "epoch": 0.9186991869918699, "frac_reward_zero_std": 0.25, "grad_norm": 0.3225323557853699, "learning_rate": 5.520000000000001e-07, "loss": -0.0178, "num_tokens": 38724663.0, "reward": 1.5539352893829346, "reward_std": 1.4159839153289795, "rewards/reward_func/mean": 1.5539352893829346, "rewards/reward_func/std": 1.4159839153289795, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9653639793395996, "sampling/importance_sampling_ratio/mean": 0.9901857376098633, "sampling/importance_sampling_ratio/min": 0.2668585479259491, "sampling/sampling_logp_difference/max": 1.3210365772247314, "sampling/sampling_logp_difference/mean": 0.019712798297405243, "step": 113, "step_time": 342.8124888394959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13271.0, "completions/max_terminated_length": 13271.0, "completions/mean_length": 7762.90625, "completions/mean_terminated_length": 7762.90625, "completions/min_length": 3438.0, "completions/min_terminated_length": 3438.0, "entropy": 0.3000789564102888, "epoch": 0.926829268292683, "frac_reward_zero_std": 0.0, "grad_norm": 0.4473620355129242, "learning_rate": 5.48e-07, "loss": 0.0414, "num_tokens": 38988212.0, "reward": 1.8438094854354858, "reward_std": 1.0651018619537354, "rewards/reward_func/mean": 1.8438094854354858, "rewards/reward_func/std": 1.065101981163025, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1569113731384277, "sampling/importance_sampling_ratio/mean": 0.989720344543457, "sampling/importance_sampling_ratio/min": 0.4092179238796234, "sampling/sampling_logp_difference/max": 0.8935074806213379, "sampling/sampling_logp_difference/mean": 0.020644046366214752, "step": 114, "step_time": 314.952937441878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16884.0, "completions/max_terminated_length": 16884.0, "completions/mean_length": 9699.75, "completions/mean_terminated_length": 9699.75, "completions/min_length": 4351.0, "completions/min_terminated_length": 4351.0, "entropy": 0.2816589046269655, "epoch": 0.9349593495934959, "frac_reward_zero_std": 0.0, "grad_norm": 0.3920203149318695, "learning_rate": 5.44e-07, "loss": 0.0309, "num_tokens": 39317620.0, "reward": 1.1251944303512573, "reward_std": 0.5114659667015076, "rewards/reward_func/mean": 1.1298178434371948, "rewards/reward_func/std": 0.5025987029075623, "rewards/soft_overlong_punishment_reward/mean": -0.0046234130859375, "rewards/soft_overlong_punishment_reward/std": 0.021913941949605942, "sampling/importance_sampling_ratio/max": 2.1895768642425537, "sampling/importance_sampling_ratio/mean": 0.990413248538971, "sampling/importance_sampling_ratio/min": 0.036309029906988144, "sampling/sampling_logp_difference/max": 3.3156888484954834, "sampling/sampling_logp_difference/mean": 0.019514720886945724, "step": 115, "step_time": 596.8481385947671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13522.0, "completions/max_terminated_length": 13522.0, "completions/mean_length": 10156.0, "completions/mean_terminated_length": 10156.0, "completions/min_length": 8118.0, "completions/min_terminated_length": 8118.0, "entropy": 0.2807458247989416, "epoch": 0.943089430894309, "frac_reward_zero_std": 0.0, "grad_norm": 0.39652687311172485, "learning_rate": 5.4e-07, "loss": 0.0079, "num_tokens": 39666956.0, "reward": 1.032971739768982, "reward_std": 0.5176628232002258, "rewards/reward_func/mean": 1.032971739768982, "rewards/reward_func/std": 0.5176628232002258, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.300656318664551, "sampling/importance_sampling_ratio/mean": 0.9902166128158569, "sampling/importance_sampling_ratio/min": 0.06670020520687103, "sampling/sampling_logp_difference/max": 2.707547187805176, "sampling/sampling_logp_difference/mean": 0.020113978534936905, "step": 116, "step_time": 329.7457778744865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18599.0, "completions/max_terminated_length": 18599.0, "completions/mean_length": 10023.34375, "completions/mean_terminated_length": 10023.34375, "completions/min_length": 3017.0, "completions/min_terminated_length": 3017.0, "entropy": 0.29004107043147087, "epoch": 0.9512195121951219, "frac_reward_zero_std": 0.0, "grad_norm": 0.3823752701282501, "learning_rate": 5.36e-07, "loss": 0.025, "num_tokens": 40002903.0, "reward": 1.0167418718338013, "reward_std": 0.5728766322135925, "rewards/reward_func/mean": 1.03364098072052, "rewards/reward_func/std": 0.5336021780967712, "rewards/soft_overlong_punishment_reward/mean": -0.01689910888671875, "rewards/soft_overlong_punishment_reward/std": 0.09559579938650131, "sampling/importance_sampling_ratio/max": 2.599911689758301, "sampling/importance_sampling_ratio/mean": 0.9898045063018799, "sampling/importance_sampling_ratio/min": 0.2843405604362488, "sampling/sampling_logp_difference/max": 1.257582664489746, "sampling/sampling_logp_difference/mean": 0.020362723618745804, "step": 117, "step_time": 400.2332091778517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20165.0, "completions/max_terminated_length": 20165.0, "completions/mean_length": 11463.84375, "completions/mean_terminated_length": 11463.84375, "completions/min_length": 6103.0, "completions/min_terminated_length": 6103.0, "entropy": 0.24839603807777166, "epoch": 0.959349593495935, "frac_reward_zero_std": 0.0, "grad_norm": 0.36443084478378296, "learning_rate": 5.32e-07, "loss": 0.0414, "num_tokens": 40388106.0, "reward": 1.3802266120910645, "reward_std": 1.3856797218322754, "rewards/reward_func/mean": 1.4090733528137207, "rewards/reward_func/std": 1.3463064432144165, "rewards/soft_overlong_punishment_reward/mean": -0.02884674072265625, "rewards/soft_overlong_punishment_reward/std": 0.16318179666996002, "sampling/importance_sampling_ratio/max": 2.2451746463775635, "sampling/importance_sampling_ratio/mean": 0.9913095235824585, "sampling/importance_sampling_ratio/min": 0.05133594200015068, "sampling/sampling_logp_difference/max": 2.9693641662597656, "sampling/sampling_logp_difference/mean": 0.018298882991075516, "step": 118, "step_time": 442.66103106434457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11502.0, "completions/max_terminated_length": 11502.0, "completions/mean_length": 9009.84375, "completions/mean_terminated_length": 9009.84375, "completions/min_length": 5196.0, "completions/min_terminated_length": 5196.0, "entropy": 0.3013112135231495, "epoch": 0.967479674796748, "frac_reward_zero_std": 0.0, "grad_norm": 0.4052444398403168, "learning_rate": 5.28e-07, "loss": 0.0246, "num_tokens": 40690197.0, "reward": 2.287106513977051, "reward_std": 3.716765880584717, "rewards/reward_func/mean": 2.287106513977051, "rewards/reward_func/std": 3.7167656421661377, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6317532062530518, "sampling/importance_sampling_ratio/mean": 0.989405632019043, "sampling/importance_sampling_ratio/min": 0.2487541288137436, "sampling/sampling_logp_difference/max": 1.391290307044983, "sampling/sampling_logp_difference/mean": 0.02114209532737732, "step": 119, "step_time": 497.79962878953665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15640.0, "completions/max_terminated_length": 15640.0, "completions/mean_length": 11166.90625, "completions/mean_terminated_length": 11166.90625, "completions/min_length": 7151.0, "completions/min_terminated_length": 7151.0, "entropy": 0.24014894664287567, "epoch": 0.975609756097561, "frac_reward_zero_std": 0.0, "grad_norm": 0.3485146760940552, "learning_rate": 5.24e-07, "loss": -0.0227, "num_tokens": 41079202.0, "reward": 1.5909485816955566, "reward_std": 2.074122667312622, "rewards/reward_func/mean": 1.5909485816955566, "rewards/reward_func/std": 2.074122667312622, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.659359931945801, "sampling/importance_sampling_ratio/mean": 0.991703987121582, "sampling/importance_sampling_ratio/min": 0.27526989579200745, "sampling/sampling_logp_difference/max": 1.2900032997131348, "sampling/sampling_logp_difference/mean": 0.017743926495313644, "step": 120, "step_time": 416.90072300843894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 20480.0, "completions/max_terminated_length": 12728.0, "completions/mean_length": 9105.25, "completions/mean_terminated_length": 8738.322265625, "completions/min_length": 5002.0, "completions/min_terminated_length": 5002.0, "entropy": 0.28143354691565037, "epoch": 0.983739837398374, "frac_reward_zero_std": 0.0, "grad_norm": 0.39125847816467285, "learning_rate": 5.2e-07, "loss": -0.049, "num_tokens": 41386626.0, "reward": 1.0522152185440063, "reward_std": 0.7879756093025208, "rewards/reward_func/mean": 1.1184157133102417, "rewards/reward_func/std": 0.7047606706619263, "rewards/soft_overlong_punishment_reward/mean": -0.03125, "rewards/soft_overlong_punishment_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.605196475982666, "sampling/importance_sampling_ratio/mean": 0.9898760318756104, "sampling/importance_sampling_ratio/min": 0.12439469248056412, "sampling/sampling_logp_difference/max": 2.0842957496643066, "sampling/sampling_logp_difference/mean": 0.020777855068445206, "step": 121, "step_time": 394.79837024072185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13989.0, "completions/max_terminated_length": 13989.0, "completions/mean_length": 9587.40625, "completions/mean_terminated_length": 9587.40625, "completions/min_length": 6042.0, "completions/min_terminated_length": 6042.0, "entropy": 0.2911383733153343, "epoch": 0.991869918699187, "frac_reward_zero_std": 0.0, "grad_norm": 0.40522804856300354, "learning_rate": 5.16e-07, "loss": 0.0389, "num_tokens": 41711359.0, "reward": 2.1636319160461426, "reward_std": 2.066298246383667, "rewards/reward_func/mean": 2.1636319160461426, "rewards/reward_func/std": 2.066298007965088, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1135575771331787, "sampling/importance_sampling_ratio/mean": 0.9899325370788574, "sampling/importance_sampling_ratio/min": 0.3981960415840149, "sampling/sampling_logp_difference/max": 0.9208108186721802, "sampling/sampling_logp_difference/mean": 0.02073751576244831, "step": 122, "step_time": 348.1325803932268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14597.0, "completions/max_terminated_length": 14597.0, "completions/mean_length": 9709.09375, "completions/mean_terminated_length": 9709.09375, "completions/min_length": 4628.0, "completions/min_terminated_length": 4628.0, "entropy": 0.2827394837513566, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.3837268650531769, "learning_rate": 5.12e-07, "loss": 0.0102, "num_tokens": 42045618.0, "reward": 1.2427458763122559, "reward_std": 0.4029768705368042, "rewards/reward_func/mean": 1.2427458763122559, "rewards/reward_func/std": 0.4029768705368042, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9902385473251343, "sampling/importance_sampling_ratio/min": 0.26592302322387695, "sampling/sampling_logp_difference/max": 1.5374226570129395, "sampling/sampling_logp_difference/mean": 0.01998906210064888, "step": 123, "step_time": 447.3557674009353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10975.0, "completions/max_terminated_length": 10975.0, "completions/mean_length": 6729.0, "completions/mean_terminated_length": 6729.0, "completions/min_length": 3134.0, "completions/min_terminated_length": 3134.0, "entropy": 0.30727832205593586, "epoch": 1.008130081300813, "frac_reward_zero_std": 0.0, "grad_norm": 0.5060835480690002, "learning_rate": 5.079999999999999e-07, "loss": -0.0037, "num_tokens": 42272234.0, "reward": 2.10992693901062, "reward_std": 1.1074674129486084, "rewards/reward_func/mean": 2.10992693901062, "rewards/reward_func/std": 1.1074674129486084, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.500887155532837, "sampling/importance_sampling_ratio/mean": 0.9892988204956055, "sampling/importance_sampling_ratio/min": 1.925503283928265e-06, "sampling/sampling_logp_difference/max": 13.160323143005371, "sampling/sampling_logp_difference/mean": 0.021586474031209946, "step": 124, "step_time": 272.6258725624066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11174.0, "completions/max_terminated_length": 11174.0, "completions/mean_length": 8467.71875, "completions/mean_terminated_length": 8467.71875, "completions/min_length": 6445.0, "completions/min_terminated_length": 6445.0, "entropy": 0.2736167572438717, "epoch": 1.016260162601626, "frac_reward_zero_std": 0.0, "grad_norm": 0.4135052263736725, "learning_rate": 5.04e-07, "loss": -0.035, "num_tokens": 42569737.0, "reward": 1.3765490055084229, "reward_std": 0.8696650862693787, "rewards/reward_func/mean": 1.3765490055084229, "rewards/reward_func/std": 0.8696651458740234, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3245065212249756, "sampling/importance_sampling_ratio/mean": 0.990541934967041, "sampling/importance_sampling_ratio/min": 0.31107988953590393, "sampling/sampling_logp_difference/max": 1.1677055358886719, "sampling/sampling_logp_difference/mean": 0.019098468124866486, "step": 125, "step_time": 385.7312441198155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11560.0, "completions/max_terminated_length": 11560.0, "completions/mean_length": 8247.96875, "completions/mean_terminated_length": 8247.96875, "completions/min_length": 6429.0, "completions/min_terminated_length": 6429.0, "entropy": 0.30573431588709354, "epoch": 1.024390243902439, "frac_reward_zero_std": 0.0, "grad_norm": 0.6381073594093323, "learning_rate": 5e-07, "loss": -0.0225, "num_tokens": 42846160.0, "reward": 1.529063105583191, "reward_std": 0.6035394668579102, "rewards/reward_func/mean": 1.529063105583191, "rewards/reward_func/std": 0.6035395264625549, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.874021291732788, "sampling/importance_sampling_ratio/mean": 0.9894424676895142, "sampling/importance_sampling_ratio/min": 5.591235822066665e-05, "sampling/sampling_logp_difference/max": 9.791725158691406, "sampling/sampling_logp_difference/mean": 0.021437201648950577, "step": 126, "step_time": 374.45182742597535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16217.0, "completions/max_terminated_length": 16217.0, "completions/mean_length": 9949.71875, "completions/mean_terminated_length": 9949.71875, "completions/min_length": 4652.0, "completions/min_terminated_length": 4652.0, "entropy": 0.31133372336626053, "epoch": 1.032520325203252, "frac_reward_zero_std": 0.0, "grad_norm": 0.39641663432121277, "learning_rate": 4.96e-07, "loss": 0.0177, "num_tokens": 43176799.0, "reward": 1.072230577468872, "reward_std": 0.6142371892929077, "rewards/reward_func/mean": 1.072230577468872, "rewards/reward_func/std": 0.6142371892929077, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1118323802948, "sampling/importance_sampling_ratio/mean": 0.989276647567749, "sampling/importance_sampling_ratio/min": 0.3954460620880127, "sampling/sampling_logp_difference/max": 0.9277408719062805, "sampling/sampling_logp_difference/mean": 0.021818656474351883, "step": 127, "step_time": 381.2957224531565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12746.0, "completions/max_terminated_length": 12746.0, "completions/mean_length": 9191.84375, "completions/mean_terminated_length": 9191.84375, "completions/min_length": 5782.0, "completions/min_terminated_length": 5782.0, "entropy": 0.30142006278038025, "epoch": 1.040650406504065, "frac_reward_zero_std": 0.0, "grad_norm": 0.42082273960113525, "learning_rate": 4.92e-07, "loss": 0.0094, "num_tokens": 43484650.0, "reward": 1.3620415925979614, "reward_std": 1.3542721271514893, "rewards/reward_func/mean": 1.3620415925979614, "rewards/reward_func/std": 1.3542720079421997, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0942885875701904, "sampling/importance_sampling_ratio/mean": 0.9896490573883057, "sampling/importance_sampling_ratio/min": 0.2504059672355652, "sampling/sampling_logp_difference/max": 1.3846718072891235, "sampling/sampling_logp_difference/mean": 0.020984603092074394, "step": 128, "step_time": 440.56851464207284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14303.0, "completions/max_terminated_length": 14303.0, "completions/mean_length": 9928.21875, "completions/mean_terminated_length": 9928.21875, "completions/min_length": 6230.0, "completions/min_terminated_length": 6230.0, "entropy": 0.2877661660313606, "epoch": 1.048780487804878, "frac_reward_zero_std": 0.0, "grad_norm": 0.39021143317222595, "learning_rate": 4.879999999999999e-07, "loss": 0.0127, "num_tokens": 43820457.0, "reward": 0.8574129343032837, "reward_std": 0.572685182094574, "rewards/reward_func/mean": 0.8574129343032837, "rewards/reward_func/std": 0.5726851224899292, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1260764598846436, "sampling/importance_sampling_ratio/mean": 0.9900418519973755, "sampling/importance_sampling_ratio/min": 0.3720153272151947, "sampling/sampling_logp_difference/max": 0.9888203144073486, "sampling/sampling_logp_difference/mean": 0.02001476287841797, "step": 129, "step_time": 541.4112695821095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15507.0, "completions/max_terminated_length": 15507.0, "completions/mean_length": 11027.96875, "completions/mean_terminated_length": 11027.96875, "completions/min_length": 6492.0, "completions/min_terminated_length": 6492.0, "entropy": 0.24912087991833687, "epoch": 1.056910569105691, "frac_reward_zero_std": 0.0, "grad_norm": 0.3552826941013336, "learning_rate": 4.839999999999999e-07, "loss": 0.0134, "num_tokens": 44211528.0, "reward": 0.9818797707557678, "reward_std": 0.6069151759147644, "rewards/reward_func/mean": 0.9818797707557678, "rewards/reward_func/std": 0.6069151163101196, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1495964527130127, "sampling/importance_sampling_ratio/mean": 0.9914515614509583, "sampling/importance_sampling_ratio/min": 0.2990056574344635, "sampling/sampling_logp_difference/max": 1.2072927951812744, "sampling/sampling_logp_difference/mean": 0.017646770924329758, "step": 130, "step_time": 481.262499995064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17006.0, "completions/max_terminated_length": 17006.0, "completions/mean_length": 9569.5, "completions/mean_terminated_length": 9569.5, "completions/min_length": 3066.0, "completions/min_terminated_length": 3066.0, "entropy": 0.27375591546297073, "epoch": 1.065040650406504, "frac_reward_zero_std": 0.0, "grad_norm": 0.40961822867393494, "learning_rate": 4.8e-07, "loss": 0.0289, "num_tokens": 44543752.0, "reward": 1.1588225364685059, "reward_std": 1.0106967687606812, "rewards/reward_func/mean": 1.1635680198669434, "rewards/reward_func/std": 1.0049266815185547, "rewards/soft_overlong_punishment_reward/mean": -0.0047454833984375, "rewards/soft_overlong_punishment_reward/std": 0.026844507083296776, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9909744262695312, "sampling/importance_sampling_ratio/min": 0.08428187668323517, "sampling/sampling_logp_difference/max": 2.473588466644287, "sampling/sampling_logp_difference/mean": 0.01833171769976616, "step": 131, "step_time": 601.2343452193309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14444.0, "completions/max_terminated_length": 14444.0, "completions/mean_length": 11014.25, "completions/mean_terminated_length": 11014.25, "completions/min_length": 8324.0, "completions/min_terminated_length": 8324.0, "entropy": 0.2876242399215698, "epoch": 1.0731707317073171, "frac_reward_zero_std": 0.0, "grad_norm": 0.4003022015094757, "learning_rate": 4.76e-07, "loss": -0.0106, "num_tokens": 44918296.0, "reward": 0.595725953578949, "reward_std": 0.6117147207260132, "rewards/reward_func/mean": 0.595725953578949, "rewards/reward_func/std": 0.6117147207260132, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3590211868286133, "sampling/importance_sampling_ratio/mean": 0.9899647235870361, "sampling/importance_sampling_ratio/min": 0.177188441157341, "sampling/sampling_logp_difference/max": 1.730541467666626, "sampling/sampling_logp_difference/mean": 0.020228557288646698, "step": 132, "step_time": 359.82743262615986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14076.0, "completions/max_terminated_length": 14076.0, "completions/mean_length": 10530.5, "completions/mean_terminated_length": 10530.5, "completions/min_length": 7319.0, "completions/min_terminated_length": 7319.0, "entropy": 0.28902580961585045, "epoch": 1.08130081300813, "frac_reward_zero_std": 0.0, "grad_norm": 0.3696592152118683, "learning_rate": 4.7199999999999994e-07, "loss": 0.0044, "num_tokens": 45273288.0, "reward": 1.0774266719818115, "reward_std": 0.7130836248397827, "rewards/reward_func/mean": 1.0774266719818115, "rewards/reward_func/std": 0.7130836248397827, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9898357391357422, "sampling/importance_sampling_ratio/min": 0.2908354103565216, "sampling/sampling_logp_difference/max": 1.9460086822509766, "sampling/sampling_logp_difference/mean": 0.020634226500988007, "step": 133, "step_time": 376.208888650639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13802.0, "completions/max_terminated_length": 13802.0, "completions/mean_length": 9979.875, "completions/mean_terminated_length": 9979.875, "completions/min_length": 5474.0, "completions/min_terminated_length": 5474.0, "entropy": 0.27061324566602707, "epoch": 1.089430894308943, "frac_reward_zero_std": 0.0, "grad_norm": 0.3747713267803192, "learning_rate": 4.68e-07, "loss": 0.0181, "num_tokens": 45618380.0, "reward": 0.9764193296432495, "reward_std": 0.5758468508720398, "rewards/reward_func/mean": 0.9764193296432495, "rewards/reward_func/std": 0.575846791267395, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9906251430511475, "sampling/importance_sampling_ratio/min": 0.396713525056839, "sampling/sampling_logp_difference/max": 1.3333165645599365, "sampling/sampling_logp_difference/mean": 0.01938489079475403, "step": 134, "step_time": 360.48224557284266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16511.0, "completions/max_terminated_length": 16511.0, "completions/mean_length": 12193.5625, "completions/mean_terminated_length": 12193.5625, "completions/min_length": 7962.0, "completions/min_terminated_length": 7962.0, "entropy": 0.2715284349396825, "epoch": 1.0975609756097562, "frac_reward_zero_std": 0.0, "grad_norm": 0.3628915250301361, "learning_rate": 4.64e-07, "loss": -0.0121, "num_tokens": 46030086.0, "reward": 0.60393226146698, "reward_std": 0.7513862252235413, "rewards/reward_func/mean": 0.6057404279708862, "rewards/reward_func/std": 0.7498459815979004, "rewards/soft_overlong_punishment_reward/mean": -0.00180816650390625, "rewards/soft_overlong_punishment_reward/std": 0.007134551648050547, "sampling/importance_sampling_ratio/max": 2.5155422687530518, "sampling/importance_sampling_ratio/mean": 0.9904724359512329, "sampling/importance_sampling_ratio/min": 0.11712665855884552, "sampling/sampling_logp_difference/max": 2.1444993019104004, "sampling/sampling_logp_difference/mean": 0.019424114376306534, "step": 135, "step_time": 433.3370830931235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 20480.0, "completions/max_terminated_length": 13810.0, "completions/mean_length": 10165.46875, "completions/mean_terminated_length": 9832.7412109375, "completions/min_length": 5537.0, "completions/min_terminated_length": 5537.0, "entropy": 0.27419494558125734, "epoch": 1.1056910569105691, "frac_reward_zero_std": 0.0, "grad_norm": 0.3413371443748474, "learning_rate": 4.6e-07, "loss": -0.0679, "num_tokens": 46373613.0, "reward": 1.0951216220855713, "reward_std": 0.6765117049217224, "rewards/reward_func/mean": 1.1627061367034912, "rewards/reward_func/std": 0.5673498511314392, "rewards/soft_overlong_punishment_reward/mean": -0.03125, "rewards/soft_overlong_punishment_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.397608518600464, "sampling/importance_sampling_ratio/mean": 0.9900693297386169, "sampling/importance_sampling_ratio/min": 0.03842584043741226, "sampling/sampling_logp_difference/max": 3.2590250968933105, "sampling/sampling_logp_difference/mean": 0.02013106644153595, "step": 136, "step_time": 417.4300327305682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13043.0, "completions/max_terminated_length": 13043.0, "completions/mean_length": 9212.1875, "completions/mean_terminated_length": 9212.1875, "completions/min_length": 4419.0, "completions/min_terminated_length": 4419.0, "entropy": 0.32444891706109047, "epoch": 1.113821138211382, "frac_reward_zero_std": 0.0, "grad_norm": 0.43532252311706543, "learning_rate": 4.56e-07, "loss": -0.0185, "num_tokens": 46680027.0, "reward": 0.972080647945404, "reward_std": 0.6541093587875366, "rewards/reward_func/mean": 0.972080647945404, "rewards/reward_func/std": 0.6541092991828918, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6352925300598145, "sampling/importance_sampling_ratio/mean": 0.9887343645095825, "sampling/importance_sampling_ratio/min": 0.02297734096646309, "sampling/sampling_logp_difference/max": 3.7732467651367188, "sampling/sampling_logp_difference/mean": 0.022742385044693947, "step": 137, "step_time": 321.7123435754329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14211.0, "completions/max_terminated_length": 14211.0, "completions/mean_length": 9940.3125, "completions/mean_terminated_length": 9940.3125, "completions/min_length": 5922.0, "completions/min_terminated_length": 5922.0, "entropy": 0.2774552209302783, "epoch": 1.1219512195121952, "frac_reward_zero_std": 0.0, "grad_norm": 0.41212156414985657, "learning_rate": 4.5199999999999997e-07, "loss": -0.0025, "num_tokens": 47020813.0, "reward": 0.8475947380065918, "reward_std": 0.6312429308891296, "rewards/reward_func/mean": 0.8475947380065918, "rewards/reward_func/std": 0.6312428712844849, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.085104465484619, "sampling/importance_sampling_ratio/mean": 0.9904999732971191, "sampling/importance_sampling_ratio/min": 0.011845460161566734, "sampling/sampling_logp_difference/max": 4.435810565948486, "sampling/sampling_logp_difference/mean": 0.01984248496592045, "step": 138, "step_time": 367.3463532931637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12646.0, "completions/max_terminated_length": 12646.0, "completions/mean_length": 9365.15625, "completions/mean_terminated_length": 9365.15625, "completions/min_length": 4490.0, "completions/min_terminated_length": 4490.0, "entropy": 0.30285973846912384, "epoch": 1.1300813008130082, "frac_reward_zero_std": 0.0, "grad_norm": 0.44502395391464233, "learning_rate": 4.48e-07, "loss": 0.0451, "num_tokens": 47334066.0, "reward": 1.4396097660064697, "reward_std": 0.8284114599227905, "rewards/reward_func/mean": 1.4396097660064697, "rewards/reward_func/std": 0.8284114003181458, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0112507343292236, "sampling/importance_sampling_ratio/mean": 0.9893834590911865, "sampling/importance_sampling_ratio/min": 0.3616284430027008, "sampling/sampling_logp_difference/max": 1.0171380043029785, "sampling/sampling_logp_difference/mean": 0.021245401352643967, "step": 139, "step_time": 320.48481974727474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15627.0, "completions/max_terminated_length": 15627.0, "completions/mean_length": 9530.0625, "completions/mean_terminated_length": 9530.0625, "completions/min_length": 2759.0, "completions/min_terminated_length": 2759.0, "entropy": 0.31202419102191925, "epoch": 1.1382113821138211, "frac_reward_zero_std": 0.0, "grad_norm": 0.42070963978767395, "learning_rate": 4.44e-07, "loss": -0.0332, "num_tokens": 47657948.0, "reward": 0.9002124071121216, "reward_std": 0.6398483514785767, "rewards/reward_func/mean": 0.9002124071121216, "rewards/reward_func/std": 0.6398482918739319, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8821748495101929, "sampling/importance_sampling_ratio/mean": 0.989189088344574, "sampling/importance_sampling_ratio/min": 0.011666374281048775, "sampling/sampling_logp_difference/max": 4.45104455947876, "sampling/sampling_logp_difference/mean": 0.02178381383419037, "step": 140, "step_time": 358.79695148952305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 20480.0, "completions/max_terminated_length": 14658.0, "completions/mean_length": 10385.03125, "completions/mean_terminated_length": 10059.38671875, "completions/min_length": 3841.0, "completions/min_terminated_length": 3841.0, "entropy": 0.29269325640052557, "epoch": 1.146341463414634, "frac_reward_zero_std": 0.0, "grad_norm": 0.3514389991760254, "learning_rate": 4.3999999999999997e-07, "loss": -0.0493, "num_tokens": 48003493.0, "reward": 0.6447830200195312, "reward_std": 0.72358638048172, "rewards/reward_func/mean": 0.6760330200195312, "rewards/reward_func/std": 0.6698598265647888, "rewards/soft_overlong_punishment_reward/mean": -0.03125, "rewards/soft_overlong_punishment_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9895683526992798, "sampling/importance_sampling_ratio/min": 0.37845414876937866, "sampling/sampling_logp_difference/max": 1.1276581287384033, "sampling/sampling_logp_difference/mean": 0.020915433764457703, "step": 141, "step_time": 402.56251001451164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13964.0, "completions/max_terminated_length": 13964.0, "completions/mean_length": 7764.46875, "completions/mean_terminated_length": 7764.46875, "completions/min_length": 3539.0, "completions/min_terminated_length": 3539.0, "entropy": 0.2780707832425833, "epoch": 1.1544715447154472, "frac_reward_zero_std": 0.0, "grad_norm": 0.44146835803985596, "learning_rate": 4.36e-07, "loss": 0.0156, "num_tokens": 48265428.0, "reward": 5.029313087463379, "reward_std": 7.416443824768066, "rewards/reward_func/mean": 5.029313087463379, "rewards/reward_func/std": 7.41644287109375, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9902679324150085, "sampling/importance_sampling_ratio/min": 0.0015649524284526706, "sampling/sampling_logp_difference/max": 6.45989990234375, "sampling/sampling_logp_difference/mean": 0.019718850031495094, "step": 142, "step_time": 491.4185498545412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15411.0, "completions/max_terminated_length": 15411.0, "completions/mean_length": 11449.90625, "completions/mean_terminated_length": 11449.90625, "completions/min_length": 7684.0, "completions/min_terminated_length": 7684.0, "entropy": 0.2600766168907285, "epoch": 1.1626016260162602, "frac_reward_zero_std": 0.0, "grad_norm": 0.34485191106796265, "learning_rate": 4.3199999999999995e-07, "loss": 0.0211, "num_tokens": 48654609.0, "reward": 1.2525250911712646, "reward_std": 0.9412046670913696, "rewards/reward_func/mean": 1.2525250911712646, "rewards/reward_func/std": 0.9412046670913696, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.541574716567993, "sampling/importance_sampling_ratio/mean": 0.9909282326698303, "sampling/importance_sampling_ratio/min": 0.34744107723236084, "sampling/sampling_logp_difference/max": 1.0571601390838623, "sampling/sampling_logp_difference/mean": 0.01887846551835537, "step": 143, "step_time": 605.705682055559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15606.0, "completions/max_terminated_length": 15606.0, "completions/mean_length": 10148.96875, "completions/mean_terminated_length": 10148.96875, "completions/min_length": 6432.0, "completions/min_terminated_length": 6432.0, "entropy": 0.28575040213763714, "epoch": 1.170731707317073, "frac_reward_zero_std": 0.0, "grad_norm": 0.3891197741031647, "learning_rate": 4.2799999999999997e-07, "loss": 0.0339, "num_tokens": 48996144.0, "reward": 2.4553003311157227, "reward_std": 3.728713274002075, "rewards/reward_func/mean": 2.4553003311157227, "rewards/reward_func/std": 3.728713274002075, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9901723861694336, "sampling/importance_sampling_ratio/min": 0.02774185873568058, "sampling/sampling_logp_difference/max": 3.584812879562378, "sampling/sampling_logp_difference/mean": 0.02012861892580986, "step": 144, "step_time": 440.6931741584558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14661.0, "completions/max_terminated_length": 14661.0, "completions/mean_length": 10413.6875, "completions/mean_terminated_length": 10413.6875, "completions/min_length": 5738.0, "completions/min_terminated_length": 5738.0, "entropy": 0.2827681256458163, "epoch": 1.1788617886178863, "frac_reward_zero_std": 0.0, "grad_norm": 0.398101806640625, "learning_rate": 4.24e-07, "loss": 0.0233, "num_tokens": 49352246.0, "reward": 0.9063984751701355, "reward_std": 0.6884288787841797, "rewards/reward_func/mean": 0.9063984751701355, "rewards/reward_func/std": 0.6884288787841797, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.003222703933716, "sampling/importance_sampling_ratio/mean": 0.9902200102806091, "sampling/importance_sampling_ratio/min": 0.07585802674293518, "sampling/sampling_logp_difference/max": 2.5788917541503906, "sampling/sampling_logp_difference/mean": 0.0197792686522007, "step": 145, "step_time": 369.9440093538724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13108.0, "completions/max_terminated_length": 13108.0, "completions/mean_length": 8945.3125, "completions/mean_terminated_length": 8945.3125, "completions/min_length": 4501.0, "completions/min_terminated_length": 4501.0, "entropy": 0.3082218300551176, "epoch": 1.1869918699186992, "frac_reward_zero_std": 0.0, "grad_norm": 0.43946680426597595, "learning_rate": 4.1999999999999995e-07, "loss": -0.0153, "num_tokens": 49652336.0, "reward": 1.0096275806427002, "reward_std": 0.6723037958145142, "rewards/reward_func/mean": 1.0096275806427002, "rewards/reward_func/std": 0.6723037958145142, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.288572072982788, "sampling/importance_sampling_ratio/mean": 0.9894624948501587, "sampling/importance_sampling_ratio/min": 2.8281038377440468e-18, "sampling/sampling_logp_difference/max": 40.406925201416016, "sampling/sampling_logp_difference/mean": 0.021884791553020477, "step": 146, "step_time": 309.9006433764007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13444.0, "completions/max_terminated_length": 13444.0, "completions/mean_length": 9562.09375, "completions/mean_terminated_length": 9562.09375, "completions/min_length": 5406.0, "completions/min_terminated_length": 5406.0, "entropy": 0.29419814236462116, "epoch": 1.1951219512195121, "frac_reward_zero_std": 0.0, "grad_norm": 0.41722893714904785, "learning_rate": 4.1599999999999997e-07, "loss": 0.0188, "num_tokens": 49973571.0, "reward": 1.3096396923065186, "reward_std": 0.5881022214889526, "rewards/reward_func/mean": 1.3096396923065186, "rewards/reward_func/std": 0.5881022214889526, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.009949207305908, "sampling/importance_sampling_ratio/mean": 0.9897855520248413, "sampling/importance_sampling_ratio/min": 0.03852084279060364, "sampling/sampling_logp_difference/max": 3.2565557956695557, "sampling/sampling_logp_difference/mean": 0.020911747589707375, "step": 147, "step_time": 319.92451414023526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14260.0, "completions/max_terminated_length": 14260.0, "completions/mean_length": 9389.1875, "completions/mean_terminated_length": 9389.1875, "completions/min_length": 4720.0, "completions/min_terminated_length": 4720.0, "entropy": 0.2792348377406597, "epoch": 1.203252032520325, "frac_reward_zero_std": 0.0, "grad_norm": 0.4073498547077179, "learning_rate": 4.12e-07, "loss": -0.0508, "num_tokens": 50295825.0, "reward": 1.020705223083496, "reward_std": 0.5470594167709351, "rewards/reward_func/mean": 1.020705223083496, "rewards/reward_func/std": 0.5470594167709351, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9243781566619873, "sampling/importance_sampling_ratio/mean": 0.9900591969490051, "sampling/importance_sampling_ratio/min": 0.13670256733894348, "sampling/sampling_logp_difference/max": 1.9899476766586304, "sampling/sampling_logp_difference/mean": 0.02007705718278885, "step": 148, "step_time": 379.96521122637205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12088.0, "completions/max_terminated_length": 12088.0, "completions/mean_length": 8509.34375, "completions/mean_terminated_length": 8509.34375, "completions/min_length": 5576.0, "completions/min_terminated_length": 5576.0, "entropy": 0.293182197958231, "epoch": 1.2113821138211383, "frac_reward_zero_std": 0.0, "grad_norm": 0.41748881340026855, "learning_rate": 4.0799999999999995e-07, "loss": 0.0059, "num_tokens": 50583228.0, "reward": 1.2018049955368042, "reward_std": 0.4176173210144043, "rewards/reward_func/mean": 1.2018049955368042, "rewards/reward_func/std": 0.4176173210144043, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9898430705070496, "sampling/importance_sampling_ratio/min": 0.21498073637485504, "sampling/sampling_logp_difference/max": 1.5372068881988525, "sampling/sampling_logp_difference/mean": 0.02043253555893898, "step": 149, "step_time": 274.8332504169084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13540.0, "completions/max_terminated_length": 13540.0, "completions/mean_length": 8832.40625, "completions/mean_terminated_length": 8832.40625, "completions/min_length": 5019.0, "completions/min_terminated_length": 5019.0, "entropy": 0.30714407935738564, "epoch": 1.2195121951219512, "frac_reward_zero_std": 0.0, "grad_norm": 0.4428982138633728, "learning_rate": 4.04e-07, "loss": -0.0408, "num_tokens": 50876777.0, "reward": 1.8848071098327637, "reward_std": 1.2892030477523804, "rewards/reward_func/mean": 1.8848071098327637, "rewards/reward_func/std": 1.2892030477523804, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8551710844039917, "sampling/importance_sampling_ratio/mean": 0.9889999032020569, "sampling/importance_sampling_ratio/min": 0.2573316991329193, "sampling/sampling_logp_difference/max": 1.3573893308639526, "sampling/sampling_logp_difference/mean": 0.022047821432352066, "step": 150, "step_time": 301.8369232052937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15122.0, "completions/max_terminated_length": 15122.0, "completions/mean_length": 9705.6875, "completions/mean_terminated_length": 9705.6875, "completions/min_length": 4756.0, "completions/min_terminated_length": 4756.0, "entropy": 0.2828159620985389, "epoch": 1.2276422764227641, "frac_reward_zero_std": 0.0, "grad_norm": 0.4180329144001007, "learning_rate": 4e-07, "loss": 0.0365, "num_tokens": 51209559.0, "reward": 1.167612075805664, "reward_std": 0.3664146959781647, "rewards/reward_func/mean": 1.167612075805664, "rewards/reward_func/std": 0.3664146959781647, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.355583906173706, "sampling/importance_sampling_ratio/mean": 0.990098774433136, "sampling/importance_sampling_ratio/min": 0.007337617687880993, "sampling/sampling_logp_difference/max": 4.914741039276123, "sampling/sampling_logp_difference/mean": 0.020141858607530594, "step": 151, "step_time": 430.1349473600276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16404.0, "completions/max_terminated_length": 16404.0, "completions/mean_length": 9639.8125, "completions/mean_terminated_length": 9639.8125, "completions/min_length": 2322.0, "completions/min_terminated_length": 2322.0, "entropy": 0.2881935928016901, "epoch": 1.2357723577235773, "frac_reward_zero_std": 0.0, "grad_norm": 0.3870021402835846, "learning_rate": 3.96e-07, "loss": 0.0149, "num_tokens": 51537321.0, "reward": 0.9652100205421448, "reward_std": 0.6336724758148193, "rewards/reward_func/mean": 0.9653626084327698, "rewards/reward_func/std": 0.6334443092346191, "rewards/soft_overlong_punishment_reward/mean": -0.000152587890625, "rewards/soft_overlong_punishment_reward/std": 0.0008631674572825432, "sampling/importance_sampling_ratio/max": 2.0554513931274414, "sampling/importance_sampling_ratio/mean": 0.9902279376983643, "sampling/importance_sampling_ratio/min": 0.1997445672750473, "sampling/sampling_logp_difference/max": 1.6107158660888672, "sampling/sampling_logp_difference/mean": 0.020111925899982452, "step": 152, "step_time": 403.21197831467725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15311.0, "completions/max_terminated_length": 15311.0, "completions/mean_length": 9762.4375, "completions/mean_terminated_length": 9762.4375, "completions/min_length": 4994.0, "completions/min_terminated_length": 4994.0, "entropy": 0.26577026676386595, "epoch": 1.2439024390243902, "frac_reward_zero_std": 0.0, "grad_norm": 0.9132823944091797, "learning_rate": 3.92e-07, "loss": 0.0401, "num_tokens": 51871503.0, "reward": 1.0906537771224976, "reward_std": 1.0960371494293213, "rewards/reward_func/mean": 1.0906537771224976, "rewards/reward_func/std": 1.0960370302200317, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9909335374832153, "sampling/importance_sampling_ratio/min": 0.032161299139261246, "sampling/sampling_logp_difference/max": 3.4369914531707764, "sampling/sampling_logp_difference/mean": 0.018757443875074387, "step": 153, "step_time": 492.0377260663081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15140.0, "completions/max_terminated_length": 15140.0, "completions/mean_length": 8624.96875, "completions/mean_terminated_length": 8624.96875, "completions/min_length": 4203.0, "completions/min_terminated_length": 4203.0, "entropy": 0.28610084764659405, "epoch": 1.2520325203252032, "frac_reward_zero_std": 0.0, "grad_norm": 0.4432617127895355, "learning_rate": 3.88e-07, "loss": 0.0781, "num_tokens": 52167454.0, "reward": 1.4906249046325684, "reward_std": 1.0208377838134766, "rewards/reward_func/mean": 1.4906249046325684, "rewards/reward_func/std": 1.0208377838134766, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.563368558883667, "sampling/importance_sampling_ratio/mean": 0.9902315139770508, "sampling/importance_sampling_ratio/min": 0.36291512846946716, "sampling/sampling_logp_difference/max": 1.0135862827301025, "sampling/sampling_logp_difference/mean": 0.02024753764271736, "step": 154, "step_time": 493.67347326362506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13499.0, "completions/max_terminated_length": 13499.0, "completions/mean_length": 10185.09375, "completions/mean_terminated_length": 10185.09375, "completions/min_length": 5779.0, "completions/min_terminated_length": 5779.0, "entropy": 0.2801782637834549, "epoch": 1.2601626016260163, "frac_reward_zero_std": 0.0, "grad_norm": 0.3910183310508728, "learning_rate": 3.84e-07, "loss": 0.0005, "num_tokens": 52513713.0, "reward": 1.3181421756744385, "reward_std": 0.4532380700111389, "rewards/reward_func/mean": 1.3181421756744385, "rewards/reward_func/std": 0.45323804020881653, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9903202056884766, "sampling/importance_sampling_ratio/min": 0.3429674208164215, "sampling/sampling_logp_difference/max": 1.1217701435089111, "sampling/sampling_logp_difference/mean": 0.01968192681670189, "step": 155, "step_time": 330.93778187967837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15302.0, "completions/max_terminated_length": 15302.0, "completions/mean_length": 9648.78125, "completions/mean_terminated_length": 9648.78125, "completions/min_length": 5341.0, "completions/min_terminated_length": 5341.0, "entropy": 0.2959721256047487, "epoch": 1.2682926829268293, "frac_reward_zero_std": 0.0, "grad_norm": 0.3946113884449005, "learning_rate": 3.7999999999999996e-07, "loss": -0.0071, "num_tokens": 52837898.0, "reward": 1.5226918458938599, "reward_std": 1.2326607704162598, "rewards/reward_func/mean": 1.5226918458938599, "rewards/reward_func/std": 1.2326607704162598, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7765085697174072, "sampling/importance_sampling_ratio/mean": 0.9898279905319214, "sampling/importance_sampling_ratio/min": 0.2457374781370163, "sampling/sampling_logp_difference/max": 1.403491497039795, "sampling/sampling_logp_difference/mean": 0.021123500540852547, "step": 156, "step_time": 403.72100708354264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12292.0, "completions/max_terminated_length": 12292.0, "completions/mean_length": 8318.3125, "completions/mean_terminated_length": 8318.3125, "completions/min_length": 4767.0, "completions/min_terminated_length": 4767.0, "entropy": 0.3122062496840954, "epoch": 1.2764227642276422, "frac_reward_zero_std": 0.0, "grad_norm": 0.45133042335510254, "learning_rate": 3.76e-07, "loss": -0.0431, "num_tokens": 53113636.0, "reward": 1.1949347257614136, "reward_std": 1.1354265213012695, "rewards/reward_func/mean": 1.1949347257614136, "rewards/reward_func/std": 1.1354265213012695, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0942885875701904, "sampling/importance_sampling_ratio/mean": 0.9893242120742798, "sampling/importance_sampling_ratio/min": 0.0679553896188736, "sampling/sampling_logp_difference/max": 2.68890380859375, "sampling/sampling_logp_difference/mean": 0.021612364798784256, "step": 157, "step_time": 298.1043352710549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16644.0, "completions/max_terminated_length": 16644.0, "completions/mean_length": 9605.53125, "completions/mean_terminated_length": 9605.53125, "completions/min_length": 2687.0, "completions/min_terminated_length": 2687.0, "entropy": 0.3122168593108654, "epoch": 1.2845528455284554, "frac_reward_zero_std": 0.0, "grad_norm": 0.4598052203655243, "learning_rate": 3.72e-07, "loss": 0.011, "num_tokens": 53431205.0, "reward": 1.7465304136276245, "reward_std": 1.7538690567016602, "rewards/reward_func/mean": 1.7485140562057495, "rewards/reward_func/std": 1.7518490552902222, "rewards/soft_overlong_punishment_reward/mean": -0.001983642578125, "rewards/soft_overlong_punishment_reward/std": 0.011221176944673061, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9891939163208008, "sampling/importance_sampling_ratio/min": 0.3125458359718323, "sampling/sampling_logp_difference/max": 1.1630041599273682, "sampling/sampling_logp_difference/mean": 0.021935712546110153, "step": 158, "step_time": 374.17676453036256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14982.0, "completions/max_terminated_length": 14982.0, "completions/mean_length": 11967.25, "completions/mean_terminated_length": 11967.25, "completions/min_length": 9173.0, "completions/min_terminated_length": 9173.0, "entropy": 0.2647007992491126, "epoch": 1.2926829268292683, "frac_reward_zero_std": 0.0, "grad_norm": 0.34376952052116394, "learning_rate": 3.6799999999999996e-07, "loss": 0.0327, "num_tokens": 53833773.0, "reward": 0.8059707880020142, "reward_std": 0.7507649660110474, "rewards/reward_func/mean": 0.8059707880020142, "rewards/reward_func/std": 0.7507650256156921, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3729827404022217, "sampling/importance_sampling_ratio/mean": 0.9908524751663208, "sampling/importance_sampling_ratio/min": 5.715029374186997e-07, "sampling/sampling_logp_difference/max": 14.374996185302734, "sampling/sampling_logp_difference/mean": 0.0191708542406559, "step": 159, "step_time": 379.8988157734275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10870.0, "completions/max_terminated_length": 10870.0, "completions/mean_length": 8080.4375, "completions/mean_terminated_length": 8080.4375, "completions/min_length": 5016.0, "completions/min_terminated_length": 5016.0, "entropy": 0.30127510614693165, "epoch": 1.3008130081300813, "frac_reward_zero_std": 0.0, "grad_norm": 0.4035295248031616, "learning_rate": 3.64e-07, "loss": -0.0268, "num_tokens": 54104107.0, "reward": 1.0803825855255127, "reward_std": 0.462587833404541, "rewards/reward_func/mean": 1.0803825855255127, "rewards/reward_func/std": 0.4625878632068634, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0112428665161133, "sampling/importance_sampling_ratio/mean": 0.9894917011260986, "sampling/importance_sampling_ratio/min": 0.26328322291374207, "sampling/sampling_logp_difference/max": 1.3345248699188232, "sampling/sampling_logp_difference/mean": 0.021139763295650482, "step": 160, "step_time": 253.5028428370133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11090.0, "completions/max_terminated_length": 11090.0, "completions/mean_length": 5596.53125, "completions/mean_terminated_length": 5596.53125, "completions/min_length": 2069.0, "completions/min_terminated_length": 2069.0, "entropy": 0.33193016052246094, "epoch": 1.3089430894308944, "frac_reward_zero_std": 0.0, "grad_norm": 0.546795666217804, "learning_rate": 3.6e-07, "loss": -0.0351, "num_tokens": 54291492.0, "reward": 1.0095927715301514, "reward_std": 0.5363887548446655, "rewards/reward_func/mean": 1.0095927715301514, "rewards/reward_func/std": 0.5363887548446655, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0163543224334717, "sampling/importance_sampling_ratio/mean": 0.9889829158782959, "sampling/importance_sampling_ratio/min": 0.24333754181861877, "sampling/sampling_logp_difference/max": 1.4133057594299316, "sampling/sampling_logp_difference/mean": 0.022219812497496605, "step": 161, "step_time": 240.68693689699285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14764.0, "completions/max_terminated_length": 14764.0, "completions/mean_length": 10763.0625, "completions/mean_terminated_length": 10763.0625, "completions/min_length": 7169.0, "completions/min_terminated_length": 7169.0, "entropy": 0.28575049340724945, "epoch": 1.3170731707317074, "frac_reward_zero_std": 0.25, "grad_norm": 0.330822229385376, "learning_rate": 3.5599999999999996e-07, "loss": 0.0071, "num_tokens": 54660694.0, "reward": 0.9929087162017822, "reward_std": 0.6908224821090698, "rewards/reward_func/mean": 0.9929087162017822, "rewards/reward_func/std": 0.690822422504425, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9898582100868225, "sampling/importance_sampling_ratio/min": 0.14805620908737183, "sampling/sampling_logp_difference/max": 1.9101632833480835, "sampling/sampling_logp_difference/mean": 0.020126353949308395, "step": 162, "step_time": 551.6217741372529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15333.0, "completions/max_terminated_length": 15333.0, "completions/mean_length": 11018.78125, "completions/mean_terminated_length": 11018.78125, "completions/min_length": 8906.0, "completions/min_terminated_length": 8906.0, "entropy": 0.2753280848264694, "epoch": 1.3252032520325203, "frac_reward_zero_std": 0.0, "grad_norm": 0.3719634711742401, "learning_rate": 3.52e-07, "loss": 0.0282, "num_tokens": 55038287.0, "reward": 1.00343656539917, "reward_std": 0.800672173500061, "rewards/reward_func/mean": 1.00343656539917, "rewards/reward_func/std": 0.800672173500061, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.007370710372925, "sampling/importance_sampling_ratio/mean": 0.9903607964515686, "sampling/importance_sampling_ratio/min": 0.00020890739688184112, "sampling/sampling_logp_difference/max": 8.47361946105957, "sampling/sampling_logp_difference/mean": 0.019656600430607796, "step": 163, "step_time": 544.9398823149968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15932.0, "completions/max_terminated_length": 15932.0, "completions/mean_length": 10442.1875, "completions/mean_terminated_length": 10442.1875, "completions/min_length": 6051.0, "completions/min_terminated_length": 6051.0, "entropy": 0.30813820846378803, "epoch": 1.3333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.3646389245986938, "learning_rate": 3.4799999999999994e-07, "loss": 0.025, "num_tokens": 55389725.0, "reward": 1.1668248176574707, "reward_std": 0.6954712867736816, "rewards/reward_func/mean": 1.1668248176574707, "rewards/reward_func/std": 0.6954712867736816, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9248101711273193, "sampling/importance_sampling_ratio/mean": 0.9890936613082886, "sampling/importance_sampling_ratio/min": 0.008675693534314632, "sampling/sampling_logp_difference/max": 4.747230052947998, "sampling/sampling_logp_difference/mean": 0.021686844527721405, "step": 164, "step_time": 348.8858406627551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14837.0, "completions/max_terminated_length": 14837.0, "completions/mean_length": 8969.65625, "completions/mean_terminated_length": 8969.65625, "completions/min_length": 5282.0, "completions/min_terminated_length": 5282.0, "entropy": 0.2883797576650977, "epoch": 1.3414634146341464, "frac_reward_zero_std": 0.0, "grad_norm": 0.4131164252758026, "learning_rate": 3.4399999999999996e-07, "loss": 0.0886, "num_tokens": 55692242.0, "reward": 1.4298856258392334, "reward_std": 1.167038917541504, "rewards/reward_func/mean": 1.4298856258392334, "rewards/reward_func/std": 1.167038917541504, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2284605503082275, "sampling/importance_sampling_ratio/mean": 0.9899915456771851, "sampling/importance_sampling_ratio/min": 0.36445239186286926, "sampling/sampling_logp_difference/max": 1.009359359741211, "sampling/sampling_logp_difference/mean": 0.019986702129244804, "step": 165, "step_time": 543.1110370436218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14801.0, "completions/max_terminated_length": 14801.0, "completions/mean_length": 9674.0625, "completions/mean_terminated_length": 9674.0625, "completions/min_length": 5760.0, "completions/min_terminated_length": 5760.0, "entropy": 0.2983303349465132, "epoch": 1.3495934959349594, "frac_reward_zero_std": 0.0, "grad_norm": 5.761723518371582, "learning_rate": 3.4000000000000003e-07, "loss": 0.022, "num_tokens": 56013684.0, "reward": 1.345805287361145, "reward_std": 0.7664235830307007, "rewards/reward_func/mean": 1.345805287361145, "rewards/reward_func/std": 0.7664235830307007, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5187416076660156, "sampling/importance_sampling_ratio/mean": 0.9897916316986084, "sampling/importance_sampling_ratio/min": 0.25386354327201843, "sampling/sampling_logp_difference/max": 1.3709583282470703, "sampling/sampling_logp_difference/mean": 0.02112661674618721, "step": 166, "step_time": 493.43985287938267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15028.0, "completions/max_terminated_length": 15028.0, "completions/mean_length": 9629.9375, "completions/mean_terminated_length": 9629.9375, "completions/min_length": 3882.0, "completions/min_terminated_length": 3882.0, "entropy": 0.26501744147390127, "epoch": 1.3577235772357723, "frac_reward_zero_std": 0.0, "grad_norm": 0.4020431339740753, "learning_rate": 3.36e-07, "loss": 0.0422, "num_tokens": 56355258.0, "reward": 1.1285045146942139, "reward_std": 0.537778377532959, "rewards/reward_func/mean": 1.1285045146942139, "rewards/reward_func/std": 0.537778377532959, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9907045364379883, "sampling/importance_sampling_ratio/min": 0.003646957455202937, "sampling/sampling_logp_difference/max": 5.613862037658691, "sampling/sampling_logp_difference/mean": 0.01883198693394661, "step": 167, "step_time": 346.18117096996866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14349.0, "completions/max_terminated_length": 14349.0, "completions/mean_length": 8990.5625, "completions/mean_terminated_length": 8990.5625, "completions/min_length": 5768.0, "completions/min_terminated_length": 5768.0, "entropy": 0.3040621541440487, "epoch": 1.3658536585365852, "frac_reward_zero_std": 0.0, "grad_norm": 0.4271112084388733, "learning_rate": 3.32e-07, "loss": -0.0135, "num_tokens": 56657932.0, "reward": 1.023315668106079, "reward_std": 0.5972030162811279, "rewards/reward_func/mean": 1.023315668106079, "rewards/reward_func/std": 0.5972029566764832, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.104907512664795, "sampling/importance_sampling_ratio/mean": 0.9894360303878784, "sampling/importance_sampling_ratio/min": 0.331084668636322, "sampling/sampling_logp_difference/max": 1.1053811311721802, "sampling/sampling_logp_difference/mean": 0.021181510761380196, "step": 168, "step_time": 523.4633230310865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13526.0, "completions/max_terminated_length": 13526.0, "completions/mean_length": 6832.15625, "completions/mean_terminated_length": 6832.15625, "completions/min_length": 2270.0, "completions/min_terminated_length": 2270.0, "entropy": 0.3238808959722519, "epoch": 1.3739837398373984, "frac_reward_zero_std": 0.0, "grad_norm": 0.5131815671920776, "learning_rate": 3.28e-07, "loss": -0.0588, "num_tokens": 56888113.0, "reward": 1.2511417865753174, "reward_std": 0.5019726157188416, "rewards/reward_func/mean": 1.2511417865753174, "rewards/reward_func/std": 0.5019726157188416, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.56583309173584, "sampling/importance_sampling_ratio/mean": 0.9888361692428589, "sampling/importance_sampling_ratio/min": 0.42743268609046936, "sampling/sampling_logp_difference/max": 0.9422832727432251, "sampling/sampling_logp_difference/mean": 0.022333543747663498, "step": 169, "step_time": 282.8712672749534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17421.0, "completions/max_terminated_length": 17421.0, "completions/mean_length": 10190.46875, "completions/mean_terminated_length": 10190.46875, "completions/min_length": 6579.0, "completions/min_terminated_length": 6579.0, "entropy": 0.305131521075964, "epoch": 1.3821138211382114, "frac_reward_zero_std": 0.0, "grad_norm": 0.3988380432128906, "learning_rate": 3.24e-07, "loss": 0.0232, "num_tokens": 57231496.0, "reward": 1.644756555557251, "reward_std": 1.3392294645309448, "rewards/reward_func/mean": 1.6526682376861572, "rewards/reward_func/std": 1.3286665678024292, "rewards/soft_overlong_punishment_reward/mean": -0.00791168212890625, "rewards/soft_overlong_punishment_reward/std": 0.0447552315890789, "sampling/importance_sampling_ratio/max": 2.1310908794403076, "sampling/importance_sampling_ratio/mean": 0.989220142364502, "sampling/importance_sampling_ratio/min": 0.04181424155831337, "sampling/sampling_logp_difference/max": 3.174518346786499, "sampling/sampling_logp_difference/mean": 0.021304449066519737, "step": 170, "step_time": 494.5488368184306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13615.0, "completions/max_terminated_length": 13615.0, "completions/mean_length": 8635.6875, "completions/mean_terminated_length": 8635.6875, "completions/min_length": 3753.0, "completions/min_terminated_length": 3753.0, "entropy": 0.315045366063714, "epoch": 1.3902439024390243, "frac_reward_zero_std": 0.0, "grad_norm": 0.4601956605911255, "learning_rate": 3.2e-07, "loss": 0.009, "num_tokens": 57518430.0, "reward": 1.3443162441253662, "reward_std": 0.39488282799720764, "rewards/reward_func/mean": 1.3443162441253662, "rewards/reward_func/std": 0.39488279819488525, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.062009572982788, "sampling/importance_sampling_ratio/mean": 0.9892582893371582, "sampling/importance_sampling_ratio/min": 0.43613678216934204, "sampling/sampling_logp_difference/max": 0.8297994136810303, "sampling/sampling_logp_difference/mean": 0.021855181083083153, "step": 171, "step_time": 453.36320838704705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13453.0, "completions/max_terminated_length": 13453.0, "completions/mean_length": 8726.84375, "completions/mean_terminated_length": 8726.84375, "completions/min_length": 6157.0, "completions/min_terminated_length": 6157.0, "entropy": 0.3162997905164957, "epoch": 1.3983739837398375, "frac_reward_zero_std": 0.0, "grad_norm": 0.4318312704563141, "learning_rate": 3.1599999999999997e-07, "loss": 0.0394, "num_tokens": 57809209.0, "reward": 1.3840892314910889, "reward_std": 0.6176471710205078, "rewards/reward_func/mean": 1.3840892314910889, "rewards/reward_func/std": 0.6176471710205078, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5114269256591797, "sampling/importance_sampling_ratio/mean": 0.9889503717422485, "sampling/importance_sampling_ratio/min": 0.022857630625367165, "sampling/sampling_logp_difference/max": 3.778470277786255, "sampling/sampling_logp_difference/mean": 0.02213275618851185, "step": 172, "step_time": 301.59747786587104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13227.0, "completions/max_terminated_length": 13227.0, "completions/mean_length": 8122.09375, "completions/mean_terminated_length": 8122.09375, "completions/min_length": 4053.0, "completions/min_terminated_length": 4053.0, "entropy": 0.2920794412493706, "epoch": 1.4065040650406504, "frac_reward_zero_std": 0.0, "grad_norm": 0.44486507773399353, "learning_rate": 3.12e-07, "loss": 0.0322, "num_tokens": 58092372.0, "reward": 1.4264354705810547, "reward_std": 0.8911484479904175, "rewards/reward_func/mean": 1.4264354705810547, "rewards/reward_func/std": 0.8911485075950623, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9848713874816895, "sampling/importance_sampling_ratio/mean": 0.9898254871368408, "sampling/importance_sampling_ratio/min": 0.0660436749458313, "sampling/sampling_logp_difference/max": 2.7174389362335205, "sampling/sampling_logp_difference/mean": 0.02044767141342163, "step": 173, "step_time": 399.5025182967074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15866.0, "completions/max_terminated_length": 15866.0, "completions/mean_length": 8942.84375, "completions/mean_terminated_length": 8942.84375, "completions/min_length": 5281.0, "completions/min_terminated_length": 5281.0, "entropy": 0.28330389875918627, "epoch": 1.4146341463414633, "frac_reward_zero_std": 0.0, "grad_norm": 0.43173524737358093, "learning_rate": 3.08e-07, "loss": 0.0439, "num_tokens": 58399383.0, "reward": 1.0660076141357422, "reward_std": 0.5465989112854004, "rewards/reward_func/mean": 1.0660076141357422, "rewards/reward_func/std": 0.5465989708900452, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1449825763702393, "sampling/importance_sampling_ratio/mean": 0.9901489019393921, "sampling/importance_sampling_ratio/min": 0.2062222957611084, "sampling/sampling_logp_difference/max": 1.5788005590438843, "sampling/sampling_logp_difference/mean": 0.01982881873846054, "step": 174, "step_time": 378.2133640507236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16162.0, "completions/max_terminated_length": 16162.0, "completions/mean_length": 7844.625, "completions/mean_terminated_length": 7844.625, "completions/min_length": 2988.0, "completions/min_terminated_length": 2988.0, "entropy": 0.31094569712877274, "epoch": 1.4227642276422765, "frac_reward_zero_std": 0.0, "grad_norm": 0.4638010263442993, "learning_rate": 3.0399999999999997e-07, "loss": -0.0176, "num_tokens": 58662163.0, "reward": 1.745612382888794, "reward_std": 1.4541947841644287, "rewards/reward_func/mean": 1.745612382888794, "rewards/reward_func/std": 1.4541946649551392, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7884289026260376, "sampling/importance_sampling_ratio/mean": 0.9891934990882874, "sampling/importance_sampling_ratio/min": 0.24390622973442078, "sampling/sampling_logp_difference/max": 1.4109714031219482, "sampling/sampling_logp_difference/mean": 0.021349426358938217, "step": 175, "step_time": 534.4618561277166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12543.0, "completions/max_terminated_length": 12543.0, "completions/mean_length": 8796.625, "completions/mean_terminated_length": 8796.625, "completions/min_length": 2902.0, "completions/min_terminated_length": 2902.0, "entropy": 0.2846886198967695, "epoch": 1.4308943089430894, "frac_reward_zero_std": 0.0, "grad_norm": 0.3884636461734772, "learning_rate": 3e-07, "loss": -0.0042, "num_tokens": 58965455.0, "reward": 1.2353037595748901, "reward_std": 0.3446652889251709, "rewards/reward_func/mean": 1.2353037595748901, "rewards/reward_func/std": 0.3446652591228485, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.538331985473633, "sampling/importance_sampling_ratio/mean": 0.9902387857437134, "sampling/importance_sampling_ratio/min": 0.05407044291496277, "sampling/sampling_logp_difference/max": 2.9174675941467285, "sampling/sampling_logp_difference/mean": 0.019742220640182495, "step": 176, "step_time": 344.33025532308966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15643.0, "completions/max_terminated_length": 15643.0, "completions/mean_length": 10607.03125, "completions/mean_terminated_length": 10607.03125, "completions/min_length": 7345.0, "completions/min_terminated_length": 7345.0, "entropy": 0.3004197124391794, "epoch": 1.4390243902439024, "frac_reward_zero_std": 0.0, "grad_norm": 0.3924950659275055, "learning_rate": 2.9599999999999995e-07, "loss": 0.007, "num_tokens": 59328144.0, "reward": 0.827019214630127, "reward_std": 0.663148045539856, "rewards/reward_func/mean": 0.827019214630127, "rewards/reward_func/std": 0.663148045539856, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9895559549331665, "sampling/importance_sampling_ratio/min": 0.00959984865039587, "sampling/sampling_logp_difference/max": 4.646008014678955, "sampling/sampling_logp_difference/mean": 0.021044539287686348, "step": 177, "step_time": 421.6234840967227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13202.0, "completions/max_terminated_length": 13202.0, "completions/mean_length": 8350.4375, "completions/mean_terminated_length": 8350.4375, "completions/min_length": 6289.0, "completions/min_terminated_length": 6289.0, "entropy": 0.3156953826546669, "epoch": 1.4471544715447155, "frac_reward_zero_std": 0.0, "grad_norm": 0.4544045627117157, "learning_rate": 2.9199999999999997e-07, "loss": 0.0208, "num_tokens": 59610238.0, "reward": 0.9881260395050049, "reward_std": 0.4920128583908081, "rewards/reward_func/mean": 0.9881260395050049, "rewards/reward_func/std": 0.4920128285884857, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.523590087890625, "sampling/importance_sampling_ratio/mean": 0.9888817071914673, "sampling/importance_sampling_ratio/min": 0.265560507774353, "sampling/sampling_logp_difference/max": 1.325912594795227, "sampling/sampling_logp_difference/mean": 0.021911103278398514, "step": 178, "step_time": 292.23001111787744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12649.0, "completions/max_terminated_length": 12649.0, "completions/mean_length": 10555.78125, "completions/mean_terminated_length": 10555.78125, "completions/min_length": 7665.0, "completions/min_terminated_length": 7665.0, "entropy": 0.28271804563701153, "epoch": 1.4552845528455285, "frac_reward_zero_std": 0.0, "grad_norm": 0.36825451254844666, "learning_rate": 2.88e-07, "loss": -0.0295, "num_tokens": 59966983.0, "reward": 1.053284764289856, "reward_std": 0.481535404920578, "rewards/reward_func/mean": 1.053284764289856, "rewards/reward_func/std": 0.4815354347229004, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7443697452545166, "sampling/importance_sampling_ratio/mean": 0.9902329444885254, "sampling/importance_sampling_ratio/min": 0.21681459248065948, "sampling/sampling_logp_difference/max": 1.5287127494812012, "sampling/sampling_logp_difference/mean": 0.019804177805781364, "step": 179, "step_time": 316.3304076856002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18300.0, "completions/max_terminated_length": 18300.0, "completions/mean_length": 9951.625, "completions/mean_terminated_length": 9951.625, "completions/min_length": 2899.0, "completions/min_terminated_length": 2899.0, "entropy": 0.25757203437387943, "epoch": 1.4634146341463414, "frac_reward_zero_std": 0.0, "grad_norm": 0.38143888115882874, "learning_rate": 2.8399999999999995e-07, "loss": 0.0341, "num_tokens": 60302987.0, "reward": 0.9845483303070068, "reward_std": 0.8336848616600037, "rewards/reward_func/mean": 0.9991662502288818, "rewards/reward_func/std": 0.8121254444122314, "rewards/soft_overlong_punishment_reward/mean": -0.014617919921875, "rewards/soft_overlong_punishment_reward/std": 0.08269143849611282, "sampling/importance_sampling_ratio/max": 2.355394124984741, "sampling/importance_sampling_ratio/mean": 0.990979790687561, "sampling/importance_sampling_ratio/min": 0.3311867415904999, "sampling/sampling_logp_difference/max": 1.1050728559494019, "sampling/sampling_logp_difference/mean": 0.018629489466547966, "step": 180, "step_time": 444.39584999810904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13861.0, "completions/max_terminated_length": 13861.0, "completions/mean_length": 8278.9375, "completions/mean_terminated_length": 8278.9375, "completions/min_length": 5209.0, "completions/min_terminated_length": 5209.0, "entropy": 0.305585864931345, "epoch": 1.4715447154471546, "frac_reward_zero_std": 0.25, "grad_norm": 0.40094515681266785, "learning_rate": 2.8e-07, "loss": 0.0255, "num_tokens": 60582913.0, "reward": 1.5346317291259766, "reward_std": 1.287915825843811, "rewards/reward_func/mean": 1.5346317291259766, "rewards/reward_func/std": 1.2879159450531006, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.060699701309204, "sampling/importance_sampling_ratio/mean": 0.9892938137054443, "sampling/importance_sampling_ratio/min": 0.18372268974781036, "sampling/sampling_logp_difference/max": 1.6943278312683105, "sampling/sampling_logp_difference/mean": 0.021300731226801872, "step": 181, "step_time": 307.09654345782474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15362.0, "completions/max_terminated_length": 15362.0, "completions/mean_length": 9544.125, "completions/mean_terminated_length": 9544.125, "completions/min_length": 6910.0, "completions/min_terminated_length": 6910.0, "entropy": 0.29761974327266216, "epoch": 1.4796747967479675, "frac_reward_zero_std": 0.0, "grad_norm": 0.4058556854724884, "learning_rate": 2.7600000000000004e-07, "loss": 0.0353, "num_tokens": 60904933.0, "reward": 1.4360902309417725, "reward_std": 1.2418161630630493, "rewards/reward_func/mean": 1.4360902309417725, "rewards/reward_func/std": 1.2418162822723389, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.379150629043579, "sampling/importance_sampling_ratio/mean": 0.9895496368408203, "sampling/importance_sampling_ratio/min": 0.0527670793235302, "sampling/sampling_logp_difference/max": 2.9418678283691406, "sampling/sampling_logp_difference/mean": 0.020994048565626144, "step": 182, "step_time": 385.816073252121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14594.0, "completions/max_terminated_length": 14594.0, "completions/mean_length": 10289.25, "completions/mean_terminated_length": 10289.25, "completions/min_length": 7779.0, "completions/min_terminated_length": 7779.0, "entropy": 0.27037277817726135, "epoch": 1.4878048780487805, "frac_reward_zero_std": 0.0, "grad_norm": 0.37331607937812805, "learning_rate": 2.72e-07, "loss": 0.0217, "num_tokens": 61268189.0, "reward": 0.7746168375015259, "reward_std": 0.6733230948448181, "rewards/reward_func/mean": 0.7746168375015259, "rewards/reward_func/std": 0.6733230948448181, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1012680530548096, "sampling/importance_sampling_ratio/mean": 0.990485668182373, "sampling/importance_sampling_ratio/min": 0.30445200204849243, "sampling/sampling_logp_difference/max": 1.189241886138916, "sampling/sampling_logp_difference/mean": 0.01935386285185814, "step": 183, "step_time": 546.588172652293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13994.0, "completions/max_terminated_length": 13994.0, "completions/mean_length": 9043.59375, "completions/mean_terminated_length": 9043.59375, "completions/min_length": 6681.0, "completions/min_terminated_length": 6681.0, "entropy": 0.29701266437768936, "epoch": 1.4959349593495934, "frac_reward_zero_std": 0.0, "grad_norm": 0.41282883286476135, "learning_rate": 2.68e-07, "loss": -0.0133, "num_tokens": 61573856.0, "reward": 1.3877496719360352, "reward_std": 0.6844647526741028, "rewards/reward_func/mean": 1.3877496719360352, "rewards/reward_func/std": 0.6844647526741028, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7882211208343506, "sampling/importance_sampling_ratio/mean": 0.9894493818283081, "sampling/importance_sampling_ratio/min": 0.3472015857696533, "sampling/sampling_logp_difference/max": 1.057849645614624, "sampling/sampling_logp_difference/mean": 0.02082856558263302, "step": 184, "step_time": 348.1651652737055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13691.0, "completions/max_terminated_length": 13691.0, "completions/mean_length": 8973.8125, "completions/mean_terminated_length": 8973.8125, "completions/min_length": 5585.0, "completions/min_terminated_length": 5585.0, "entropy": 0.28570630215108395, "epoch": 1.5040650406504064, "frac_reward_zero_std": 0.0, "grad_norm": 0.4094651937484741, "learning_rate": 2.64e-07, "loss": 0.0304, "num_tokens": 61875338.0, "reward": 1.4425820112228394, "reward_std": 1.3329182863235474, "rewards/reward_func/mean": 1.4425820112228394, "rewards/reward_func/std": 1.3329182863235474, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9900859594345093, "sampling/importance_sampling_ratio/min": 0.11671532690525055, "sampling/sampling_logp_difference/max": 2.148017406463623, "sampling/sampling_logp_difference/mean": 0.01998751237988472, "step": 185, "step_time": 517.4829101127107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15249.0, "completions/max_terminated_length": 15249.0, "completions/mean_length": 8211.6875, "completions/mean_terminated_length": 8211.6875, "completions/min_length": 4763.0, "completions/min_terminated_length": 4763.0, "entropy": 0.3177320044487715, "epoch": 1.5121951219512195, "frac_reward_zero_std": 0.0, "grad_norm": 0.9853414297103882, "learning_rate": 2.6e-07, "loss": -0.0002, "num_tokens": 62148608.0, "reward": 1.4485201835632324, "reward_std": 0.19182679057121277, "rewards/reward_func/mean": 1.4485201835632324, "rewards/reward_func/std": 0.1918267458677292, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7517563104629517, "sampling/importance_sampling_ratio/mean": 0.9886523485183716, "sampling/importance_sampling_ratio/min": 0.43156927824020386, "sampling/sampling_logp_difference/max": 0.840327262878418, "sampling/sampling_logp_difference/mean": 0.02235923707485199, "step": 186, "step_time": 319.5429007699713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16617.0, "completions/max_terminated_length": 16617.0, "completions/mean_length": 8996.59375, "completions/mean_terminated_length": 8996.59375, "completions/min_length": 3658.0, "completions/min_terminated_length": 3658.0, "entropy": 0.2926332140341401, "epoch": 1.5203252032520327, "frac_reward_zero_std": 0.0, "grad_norm": 0.40174105763435364, "learning_rate": 2.56e-07, "loss": 0.0098, "num_tokens": 62451283.0, "reward": 1.507631778717041, "reward_std": 1.6578742265701294, "rewards/reward_func/mean": 1.5094094276428223, "rewards/reward_func/std": 1.6561721563339233, "rewards/soft_overlong_punishment_reward/mean": -0.00177764892578125, "rewards/soft_overlong_punishment_reward/std": 0.010055900551378727, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9902303814888, "sampling/importance_sampling_ratio/min": 0.30990007519721985, "sampling/sampling_logp_difference/max": 1.633288860321045, "sampling/sampling_logp_difference/mean": 0.020211469382047653, "step": 187, "step_time": 336.45058575086296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16478.0, "completions/max_terminated_length": 16478.0, "completions/mean_length": 10577.6875, "completions/mean_terminated_length": 10577.6875, "completions/min_length": 7560.0, "completions/min_terminated_length": 7560.0, "entropy": 0.2819963004440069, "epoch": 1.5284552845528454, "frac_reward_zero_std": 0.0, "grad_norm": 0.37996959686279297, "learning_rate": 2.52e-07, "loss": 0.0144, "num_tokens": 62817409.0, "reward": 0.5808703303337097, "reward_std": 0.8241087198257446, "rewards/reward_func/mean": 0.5815874934196472, "rewards/reward_func/std": 0.8235760927200317, "rewards/soft_overlong_punishment_reward/mean": -0.0007171630859375, "rewards/soft_overlong_punishment_reward/std": 0.004056887235492468, "sampling/importance_sampling_ratio/max": 2.636267900466919, "sampling/importance_sampling_ratio/mean": 0.9898158311843872, "sampling/importance_sampling_ratio/min": 0.0027106255292892456, "sampling/sampling_logp_difference/max": 5.910575866699219, "sampling/sampling_logp_difference/mean": 0.02044173702597618, "step": 188, "step_time": 499.72255454421975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14677.0, "completions/max_terminated_length": 14677.0, "completions/mean_length": 9978.09375, "completions/mean_terminated_length": 9978.09375, "completions/min_length": 4882.0, "completions/min_terminated_length": 4882.0, "entropy": 0.2741777431219816, "epoch": 1.5365853658536586, "frac_reward_zero_std": 0.0, "grad_norm": 0.3784760534763336, "learning_rate": 2.48e-07, "loss": 0.013, "num_tokens": 63159460.0, "reward": 1.0896389484405518, "reward_std": 0.4881446063518524, "rewards/reward_func/mean": 1.0896389484405518, "rewards/reward_func/std": 0.4881446361541748, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9906512498855591, "sampling/importance_sampling_ratio/min": 0.29679495096206665, "sampling/sampling_logp_difference/max": 1.8686549663543701, "sampling/sampling_logp_difference/mean": 0.019687175750732422, "step": 189, "step_time": 319.22378592635505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15448.0, "completions/max_terminated_length": 15448.0, "completions/mean_length": 10634.625, "completions/mean_terminated_length": 10634.625, "completions/min_length": 6856.0, "completions/min_terminated_length": 6856.0, "entropy": 0.27850321400910616, "epoch": 1.5447154471544715, "frac_reward_zero_std": 0.0, "grad_norm": 0.3891684114933014, "learning_rate": 2.4399999999999996e-07, "loss": 0.0268, "num_tokens": 63523312.0, "reward": 0.9600388407707214, "reward_std": 0.6232390403747559, "rewards/reward_func/mean": 0.9600388407707214, "rewards/reward_func/std": 0.6232389807701111, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9903505444526672, "sampling/importance_sampling_ratio/min": 0.1122722402215004, "sampling/sampling_logp_difference/max": 2.18682861328125, "sampling/sampling_logp_difference/mean": 0.0196834784001112, "step": 190, "step_time": 358.385191940004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12682.0, "completions/max_terminated_length": 12682.0, "completions/mean_length": 7577.3125, "completions/mean_terminated_length": 7577.3125, "completions/min_length": 1562.0, "completions/min_terminated_length": 1562.0, "entropy": 0.31117005459964275, "epoch": 1.5528455284552845, "frac_reward_zero_std": 0.0, "grad_norm": 0.4534858167171478, "learning_rate": 2.4e-07, "loss": -0.0218, "num_tokens": 63777218.0, "reward": 1.481154203414917, "reward_std": 0.709784984588623, "rewards/reward_func/mean": 1.481154203414917, "rewards/reward_func/std": 0.7097850441932678, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9891089797019958, "sampling/importance_sampling_ratio/min": 0.2799838185310364, "sampling/sampling_logp_difference/max": 1.3334325551986694, "sampling/sampling_logp_difference/mean": 0.021504439413547516, "step": 191, "step_time": 333.1857100597117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12243.0, "completions/max_terminated_length": 12243.0, "completions/mean_length": 9093.09375, "completions/mean_terminated_length": 9093.09375, "completions/min_length": 5665.0, "completions/min_terminated_length": 5665.0, "entropy": 0.284919373691082, "epoch": 1.5609756097560976, "frac_reward_zero_std": 0.0, "grad_norm": 0.40635332465171814, "learning_rate": 2.3599999999999997e-07, "loss": -0.0044, "num_tokens": 64081693.0, "reward": 1.447981834411621, "reward_std": 1.2502341270446777, "rewards/reward_func/mean": 1.447981834411621, "rewards/reward_func/std": 1.2502341270446777, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4054501056671143, "sampling/importance_sampling_ratio/mean": 0.9902962446212769, "sampling/importance_sampling_ratio/min": 0.06248319521546364, "sampling/sampling_logp_difference/max": 2.772857666015625, "sampling/sampling_logp_difference/mean": 0.01986866444349289, "step": 192, "step_time": 292.0563309621066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15147.0, "completions/max_terminated_length": 15147.0, "completions/mean_length": 10851.03125, "completions/mean_terminated_length": 10851.03125, "completions/min_length": 7781.0, "completions/min_terminated_length": 7781.0, "entropy": 0.29965684935450554, "epoch": 1.5691056910569106, "frac_reward_zero_std": 0.0, "grad_norm": 0.3997621536254883, "learning_rate": 2.32e-07, "loss": -0.0378, "num_tokens": 64449766.0, "reward": 0.8266834020614624, "reward_std": 0.6421800851821899, "rewards/reward_func/mean": 0.8266834020614624, "rewards/reward_func/std": 0.6421800255775452, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9896982908248901, "sampling/importance_sampling_ratio/min": 0.00025375522091053426, "sampling/sampling_logp_difference/max": 8.27914047241211, "sampling/sampling_logp_difference/mean": 0.021142274141311646, "step": 193, "step_time": 365.25503698992543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15063.0, "completions/max_terminated_length": 15063.0, "completions/mean_length": 10946.09375, "completions/mean_terminated_length": 10946.09375, "completions/min_length": 6879.0, "completions/min_terminated_length": 6879.0, "entropy": 0.31737131997942924, "epoch": 1.5772357723577235, "frac_reward_zero_std": 0.0, "grad_norm": 0.8579146265983582, "learning_rate": 2.28e-07, "loss": 0.0609, "num_tokens": 64812785.0, "reward": 1.1919212341308594, "reward_std": 0.6313185691833496, "rewards/reward_func/mean": 1.1919212341308594, "rewards/reward_func/std": 0.6313185691833496, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2314682006835938, "sampling/importance_sampling_ratio/mean": 0.989063024520874, "sampling/importance_sampling_ratio/min": 0.3655092716217041, "sampling/sampling_logp_difference/max": 1.006463646888733, "sampling/sampling_logp_difference/mean": 0.02223176136612892, "step": 194, "step_time": 351.5693916489836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14915.0, "completions/max_terminated_length": 14915.0, "completions/mean_length": 10867.5625, "completions/mean_terminated_length": 10867.5625, "completions/min_length": 5934.0, "completions/min_terminated_length": 5934.0, "entropy": 0.2743126470595598, "epoch": 1.5853658536585367, "frac_reward_zero_std": 0.0, "grad_norm": 0.36443397402763367, "learning_rate": 2.24e-07, "loss": 0.0046, "num_tokens": 65179611.0, "reward": 1.07879638671875, "reward_std": 0.6518019437789917, "rewards/reward_func/mean": 1.07879638671875, "rewards/reward_func/std": 0.6518019437789917, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1472198963165283, "sampling/importance_sampling_ratio/mean": 0.9904861450195312, "sampling/importance_sampling_ratio/min": 0.10317613184452057, "sampling/sampling_logp_difference/max": 2.271317720413208, "sampling/sampling_logp_difference/mean": 0.01932957023382187, "step": 195, "step_time": 342.88379572634585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14422.0, "completions/max_terminated_length": 14422.0, "completions/mean_length": 9513.5625, "completions/mean_terminated_length": 9513.5625, "completions/min_length": 3494.0, "completions/min_terminated_length": 3494.0, "entropy": 0.28421447053551674, "epoch": 1.5934959349593496, "frac_reward_zero_std": 0.0, "grad_norm": 0.40306708216667175, "learning_rate": 2.1999999999999998e-07, "loss": 0.0735, "num_tokens": 65502101.0, "reward": 1.6892921924591064, "reward_std": 1.7240147590637207, "rewards/reward_func/mean": 1.6892921924591064, "rewards/reward_func/std": 1.7240146398544312, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9903287887573242, "sampling/importance_sampling_ratio/min": 0.06248319521546364, "sampling/sampling_logp_difference/max": 2.772857666015625, "sampling/sampling_logp_difference/mean": 0.019995655864477158, "step": 196, "step_time": 333.16898891096935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15583.0, "completions/max_terminated_length": 15583.0, "completions/mean_length": 11488.0625, "completions/mean_terminated_length": 11488.0625, "completions/min_length": 8134.0, "completions/min_terminated_length": 8134.0, "entropy": 0.27682713977992535, "epoch": 1.6016260162601625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3545663356781006, "learning_rate": 2.1599999999999998e-07, "loss": 0.0278, "num_tokens": 65893255.0, "reward": 1.7090435028076172, "reward_std": 1.8992475271224976, "rewards/reward_func/mean": 1.7090435028076172, "rewards/reward_func/std": 1.899247646331787, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9902373552322388, "sampling/importance_sampling_ratio/min": 0.2602575421333313, "sampling/sampling_logp_difference/max": 1.346083641052246, "sampling/sampling_logp_difference/mean": 0.01981320045888424, "step": 197, "step_time": 370.34518124256283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12766.0, "completions/max_terminated_length": 12766.0, "completions/mean_length": 9726.1875, "completions/mean_terminated_length": 9726.1875, "completions/min_length": 4384.0, "completions/min_terminated_length": 4384.0, "entropy": 0.2942858459427953, "epoch": 1.6097560975609757, "frac_reward_zero_std": 0.25, "grad_norm": 0.34428897500038147, "learning_rate": 2.12e-07, "loss": 0.0091, "num_tokens": 66231221.0, "reward": 0.8734248280525208, "reward_std": 0.6437625288963318, "rewards/reward_func/mean": 0.8734248280525208, "rewards/reward_func/std": 0.6437625288963318, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2832908630371094, "sampling/importance_sampling_ratio/mean": 0.9896547794342041, "sampling/importance_sampling_ratio/min": 0.2729717493057251, "sampling/sampling_logp_difference/max": 1.298387050628662, "sampling/sampling_logp_difference/mean": 0.020584698766469955, "step": 198, "step_time": 540.5545430611819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12849.0, "completions/max_terminated_length": 12849.0, "completions/mean_length": 9709.1875, "completions/mean_terminated_length": 9709.1875, "completions/min_length": 6938.0, "completions/min_terminated_length": 6938.0, "entropy": 0.29280679672956467, "epoch": 1.6178861788617886, "frac_reward_zero_std": 0.0, "grad_norm": 0.40767934918403625, "learning_rate": 2.0799999999999998e-07, "loss": 0.0156, "num_tokens": 66560883.0, "reward": 0.6801372766494751, "reward_std": 0.6403651237487793, "rewards/reward_func/mean": 0.6801372766494751, "rewards/reward_func/std": 0.6403650641441345, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7840784788131714, "sampling/importance_sampling_ratio/mean": 0.989808201789856, "sampling/importance_sampling_ratio/min": 0.3189617097377777, "sampling/sampling_logp_difference/max": 1.1426842212677002, "sampling/sampling_logp_difference/mean": 0.020620396360754967, "step": 199, "step_time": 288.9684821246192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15979.0, "completions/max_terminated_length": 15979.0, "completions/mean_length": 10304.9375, "completions/mean_terminated_length": 10304.9375, "completions/min_length": 3889.0, "completions/min_terminated_length": 3889.0, "entropy": 0.27815717831254005, "epoch": 1.6260162601626016, "frac_reward_zero_std": 0.0, "grad_norm": 0.38130983710289, "learning_rate": 2.0399999999999997e-07, "loss": 0.0202, "num_tokens": 66909617.0, "reward": 1.4418128728866577, "reward_std": 1.1881235837936401, "rewards/reward_func/mean": 1.4418128728866577, "rewards/reward_func/std": 1.1881235837936401, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2451488971710205, "sampling/importance_sampling_ratio/mean": 0.9904321432113647, "sampling/importance_sampling_ratio/min": 0.3691475987434387, "sampling/sampling_logp_difference/max": 0.9965587258338928, "sampling/sampling_logp_difference/mean": 0.019482022151350975, "step": 200, "step_time": 518.1173423542641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13914.0, "completions/max_terminated_length": 13914.0, "completions/mean_length": 9100.125, "completions/mean_terminated_length": 9100.125, "completions/min_length": 5304.0, "completions/min_terminated_length": 5304.0, "entropy": 0.27993758488446474, "epoch": 1.6341463414634148, "frac_reward_zero_std": 0.0, "grad_norm": 0.374416708946228, "learning_rate": 2e-07, "loss": 0.0004, "num_tokens": 67219981.0, "reward": 1.3626954555511475, "reward_std": 0.2722192108631134, "rewards/reward_func/mean": 1.3626954555511475, "rewards/reward_func/std": 0.2722192406654358, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.924107074737549, "sampling/importance_sampling_ratio/mean": 0.9901722073554993, "sampling/importance_sampling_ratio/min": 0.24207624793052673, "sampling/sampling_logp_difference/max": 1.4185025691986084, "sampling/sampling_logp_difference/mean": 0.01998857781291008, "step": 201, "step_time": 314.7476293069776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13731.0, "completions/max_terminated_length": 13731.0, "completions/mean_length": 9330.75, "completions/mean_terminated_length": 9330.75, "completions/min_length": 6053.0, "completions/min_terminated_length": 6053.0, "entropy": 0.2888748459517956, "epoch": 1.6422764227642277, "frac_reward_zero_std": 0.0, "grad_norm": 0.3868502974510193, "learning_rate": 1.96e-07, "loss": 0.007, "num_tokens": 67532997.0, "reward": 1.3977718353271484, "reward_std": 0.4272344708442688, "rewards/reward_func/mean": 1.3977718353271484, "rewards/reward_func/std": 0.4272345006465912, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.656132936477661, "sampling/importance_sampling_ratio/mean": 0.9900428652763367, "sampling/importance_sampling_ratio/min": 0.12062080949544907, "sampling/sampling_logp_difference/max": 2.1151034832000732, "sampling/sampling_logp_difference/mean": 0.020267413929104805, "step": 202, "step_time": 308.19105943036266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14783.0, "completions/max_terminated_length": 14783.0, "completions/mean_length": 10748.21875, "completions/mean_terminated_length": 10748.21875, "completions/min_length": 6890.0, "completions/min_terminated_length": 6890.0, "entropy": 0.2793641071766615, "epoch": 1.6504065040650406, "frac_reward_zero_std": 0.0, "grad_norm": 0.5265854001045227, "learning_rate": 1.92e-07, "loss": -0.0092, "num_tokens": 67896876.0, "reward": 1.05764901638031, "reward_std": 0.5356566309928894, "rewards/reward_func/mean": 1.05764901638031, "rewards/reward_func/std": 0.5356566309928894, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9288419485092163, "sampling/importance_sampling_ratio/mean": 0.9903679490089417, "sampling/importance_sampling_ratio/min": 0.373412162065506, "sampling/sampling_logp_difference/max": 0.9850724935531616, "sampling/sampling_logp_difference/mean": 0.019830968230962753, "step": 203, "step_time": 414.82067601103336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16746.0, "completions/max_terminated_length": 16746.0, "completions/mean_length": 12129.65625, "completions/mean_terminated_length": 12129.65625, "completions/min_length": 7692.0, "completions/min_terminated_length": 7692.0, "entropy": 0.25799091067165136, "epoch": 1.6585365853658538, "frac_reward_zero_std": 0.0, "grad_norm": 0.36031574010849, "learning_rate": 1.88e-07, "loss": 0.0602, "num_tokens": 68311737.0, "reward": 1.1385812759399414, "reward_std": 0.9196106791496277, "rewards/reward_func/mean": 1.1421899795532227, "rewards/reward_func/std": 0.9190824627876282, "rewards/soft_overlong_punishment_reward/mean": -0.00360870361328125, "rewards/soft_overlong_punishment_reward/std": 0.016192881390452385, "sampling/importance_sampling_ratio/max": 2.586771011352539, "sampling/importance_sampling_ratio/mean": 0.9909306168556213, "sampling/importance_sampling_ratio/min": 0.07289119064807892, "sampling/sampling_logp_difference/max": 2.6187875270843506, "sampling/sampling_logp_difference/mean": 0.018782958388328552, "step": 204, "step_time": 588.7674697954208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14103.0, "completions/max_terminated_length": 14103.0, "completions/mean_length": 9646.96875, "completions/mean_terminated_length": 9646.96875, "completions/min_length": 5721.0, "completions/min_terminated_length": 5721.0, "entropy": 0.2763592302799225, "epoch": 1.6666666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.3763327896595001, "learning_rate": 1.8399999999999998e-07, "loss": 0.0166, "num_tokens": 68639720.0, "reward": 1.1393167972564697, "reward_std": 0.49497222900390625, "rewards/reward_func/mean": 1.1393167972564697, "rewards/reward_func/std": 0.49497222900390625, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4610517024993896, "sampling/importance_sampling_ratio/mean": 0.9903528094291687, "sampling/importance_sampling_ratio/min": 0.13930465281009674, "sampling/sampling_logp_difference/max": 1.9710919857025146, "sampling/sampling_logp_difference/mean": 0.019617252051830292, "step": 205, "step_time": 339.363127422519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13168.0, "completions/max_terminated_length": 13168.0, "completions/mean_length": 8321.0625, "completions/mean_terminated_length": 8321.0625, "completions/min_length": 4609.0, "completions/min_terminated_length": 4609.0, "entropy": 0.29074352979660034, "epoch": 1.6747967479674797, "frac_reward_zero_std": 0.0, "grad_norm": 0.46806299686431885, "learning_rate": 1.8e-07, "loss": -0.0141, "num_tokens": 68923034.0, "reward": 1.2282514572143555, "reward_std": 0.945728063583374, "rewards/reward_func/mean": 1.2282514572143555, "rewards/reward_func/std": 0.945728063583374, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9357402324676514, "sampling/importance_sampling_ratio/mean": 0.9899877309799194, "sampling/importance_sampling_ratio/min": 0.004849388264119625, "sampling/sampling_logp_difference/max": 5.328902721405029, "sampling/sampling_logp_difference/mean": 0.019961174577474594, "step": 206, "step_time": 335.27952317520976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14498.0, "completions/max_terminated_length": 14498.0, "completions/mean_length": 9859.0, "completions/mean_terminated_length": 9859.0, "completions/min_length": 7080.0, "completions/min_terminated_length": 7080.0, "entropy": 0.2817118000239134, "epoch": 1.6829268292682928, "frac_reward_zero_std": 0.0, "grad_norm": 0.41321122646331787, "learning_rate": 1.76e-07, "loss": -0.0156, "num_tokens": 69258610.0, "reward": 1.5615112781524658, "reward_std": 1.2650940418243408, "rewards/reward_func/mean": 1.5615112781524658, "rewards/reward_func/std": 1.2650940418243408, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.48781681060791, "sampling/importance_sampling_ratio/mean": 0.9902306795120239, "sampling/importance_sampling_ratio/min": 0.09276595711708069, "sampling/sampling_logp_difference/max": 2.3776755332946777, "sampling/sampling_logp_difference/mean": 0.01979946345090866, "step": 207, "step_time": 412.5895248707384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12872.0, "completions/max_terminated_length": 12872.0, "completions/mean_length": 8875.53125, "completions/mean_terminated_length": 8875.53125, "completions/min_length": 4162.0, "completions/min_terminated_length": 4162.0, "entropy": 0.28253381699323654, "epoch": 1.6910569105691056, "frac_reward_zero_std": 0.0, "grad_norm": 0.3998742699623108, "learning_rate": 1.7199999999999998e-07, "loss": -0.0407, "num_tokens": 69556795.0, "reward": 1.1288670301437378, "reward_std": 0.5693311095237732, "rewards/reward_func/mean": 1.1288670301437378, "rewards/reward_func/std": 0.5693311095237732, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9901490211486816, "sampling/importance_sampling_ratio/min": 0.33106470108032227, "sampling/sampling_logp_difference/max": 1.128915548324585, "sampling/sampling_logp_difference/mean": 0.019885778427124023, "step": 208, "step_time": 287.43813130888157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14729.0, "completions/max_terminated_length": 14729.0, "completions/mean_length": 10587.9375, "completions/mean_terminated_length": 10587.9375, "completions/min_length": 7189.0, "completions/min_terminated_length": 7189.0, "entropy": 0.2850116267800331, "epoch": 1.6991869918699187, "frac_reward_zero_std": 0.0, "grad_norm": 0.37196865677833557, "learning_rate": 1.68e-07, "loss": 0.0645, "num_tokens": 69912961.0, "reward": 1.4343760013580322, "reward_std": 1.080871343612671, "rewards/reward_func/mean": 1.4343760013580322, "rewards/reward_func/std": 1.080871343612671, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9901971220970154, "sampling/importance_sampling_ratio/min": 0.3677317798137665, "sampling/sampling_logp_difference/max": 1.0991556644439697, "sampling/sampling_logp_difference/mean": 0.02008083090186119, "step": 209, "step_time": 352.02227776590735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15064.0, "completions/max_terminated_length": 15064.0, "completions/mean_length": 11466.5, "completions/mean_terminated_length": 11466.5, "completions/min_length": 8096.0, "completions/min_terminated_length": 8096.0, "entropy": 0.25724725145846605, "epoch": 1.7073170731707317, "frac_reward_zero_std": 0.0, "grad_norm": 0.3804565668106079, "learning_rate": 1.64e-07, "loss": -0.0099, "num_tokens": 70307169.0, "reward": 1.047875165939331, "reward_std": 0.6655042767524719, "rewards/reward_func/mean": 1.047875165939331, "rewards/reward_func/std": 0.6655042767524719, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.398998260498047, "sampling/importance_sampling_ratio/mean": 0.9909435510635376, "sampling/importance_sampling_ratio/min": 0.037961445748806, "sampling/sampling_logp_difference/max": 3.271184206008911, "sampling/sampling_logp_difference/mean": 0.01853536069393158, "step": 210, "step_time": 377.21587240928784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18850.0, "completions/max_terminated_length": 18850.0, "completions/mean_length": 11776.53125, "completions/mean_terminated_length": 11776.53125, "completions/min_length": 8714.0, "completions/min_terminated_length": 8714.0, "entropy": 0.26618711929768324, "epoch": 1.7154471544715446, "frac_reward_zero_std": 0.0, "grad_norm": 0.34400245547294617, "learning_rate": 1.6e-07, "loss": 0.0338, "num_tokens": 70713522.0, "reward": 0.792148232460022, "reward_std": 0.7291855812072754, "rewards/reward_func/mean": 0.8139911890029907, "rewards/reward_func/std": 0.701914370059967, "rewards/soft_overlong_punishment_reward/mean": -0.02184295654296875, "rewards/soft_overlong_punishment_reward/std": 0.10657885670661926, "sampling/importance_sampling_ratio/max": 2.09429669380188, "sampling/importance_sampling_ratio/mean": 0.9909163117408752, "sampling/importance_sampling_ratio/min": 0.15467022359371185, "sampling/sampling_logp_difference/max": 1.8664600849151611, "sampling/sampling_logp_difference/mean": 0.018423916772007942, "step": 211, "step_time": 408.3389090951532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14657.0, "completions/max_terminated_length": 14657.0, "completions/mean_length": 10683.625, "completions/mean_terminated_length": 10683.625, "completions/min_length": 5483.0, "completions/min_terminated_length": 5483.0, "entropy": 0.2850087806582451, "epoch": 1.7235772357723578, "frac_reward_zero_std": 0.0, "grad_norm": 0.3700617551803589, "learning_rate": 1.56e-07, "loss": -0.0044, "num_tokens": 71079998.0, "reward": 1.0544310808181763, "reward_std": 0.8633791208267212, "rewards/reward_func/mean": 1.0544310808181763, "rewards/reward_func/std": 0.8633792400360107, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0554580688476562, "sampling/importance_sampling_ratio/mean": 0.990363597869873, "sampling/importance_sampling_ratio/min": 0.038124918937683105, "sampling/sampling_logp_difference/max": 3.2668871879577637, "sampling/sampling_logp_difference/mean": 0.019684813916683197, "step": 212, "step_time": 422.1297117122449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14373.0, "completions/max_terminated_length": 14373.0, "completions/mean_length": 9534.375, "completions/mean_terminated_length": 9534.375, "completions/min_length": 2458.0, "completions/min_terminated_length": 2458.0, "entropy": 0.28476100601255894, "epoch": 1.7317073170731707, "frac_reward_zero_std": 0.0, "grad_norm": 0.4524003267288208, "learning_rate": 1.5199999999999998e-07, "loss": -0.031, "num_tokens": 71408890.0, "reward": 0.8519082069396973, "reward_std": 0.6510614156723022, "rewards/reward_func/mean": 0.8519082069396973, "rewards/reward_func/std": 0.651061475276947, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9415626525878906, "sampling/importance_sampling_ratio/mean": 0.9903674125671387, "sampling/importance_sampling_ratio/min": 0.015267021022737026, "sampling/sampling_logp_difference/max": 4.182060241699219, "sampling/sampling_logp_difference/mean": 0.019843295216560364, "step": 213, "step_time": 335.98301385412924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15211.0, "completions/max_terminated_length": 15211.0, "completions/mean_length": 9717.375, "completions/mean_terminated_length": 9717.375, "completions/min_length": 5821.0, "completions/min_terminated_length": 5821.0, "entropy": 0.2980188447982073, "epoch": 1.7398373983739837, "frac_reward_zero_std": 0.0, "grad_norm": 0.3987669348716736, "learning_rate": 1.4799999999999998e-07, "loss": 0.0109, "num_tokens": 71739526.0, "reward": 1.5768582820892334, "reward_std": 0.8727981448173523, "rewards/reward_func/mean": 1.5768582820892334, "rewards/reward_func/std": 0.8727982044219971, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9979711771011353, "sampling/importance_sampling_ratio/mean": 0.9897875785827637, "sampling/importance_sampling_ratio/min": 0.42823854088783264, "sampling/sampling_logp_difference/max": 0.8480749130249023, "sampling/sampling_logp_difference/mean": 0.020624473690986633, "step": 214, "step_time": 444.9323614304885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14265.0, "completions/max_terminated_length": 14265.0, "completions/mean_length": 10103.03125, "completions/mean_terminated_length": 10103.03125, "completions/min_length": 5647.0, "completions/min_terminated_length": 5647.0, "entropy": 0.293839693069458, "epoch": 1.7479674796747968, "frac_reward_zero_std": 0.0, "grad_norm": 0.3844790756702423, "learning_rate": 1.44e-07, "loss": -0.0244, "num_tokens": 72078767.0, "reward": 2.067140817642212, "reward_std": 2.6612210273742676, "rewards/reward_func/mean": 2.067140817642212, "rewards/reward_func/std": 2.6612212657928467, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9357402324676514, "sampling/importance_sampling_ratio/mean": 0.9895575046539307, "sampling/importance_sampling_ratio/min": 0.04509971663355827, "sampling/sampling_logp_difference/max": 3.098879337310791, "sampling/sampling_logp_difference/mean": 0.020683787763118744, "step": 215, "step_time": 401.31944015505724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17210.0, "completions/max_terminated_length": 17210.0, "completions/mean_length": 9679.5625, "completions/mean_terminated_length": 9679.5625, "completions/min_length": 3255.0, "completions/min_terminated_length": 3255.0, "entropy": 0.283124684356153, "epoch": 1.7560975609756098, "frac_reward_zero_std": 0.0, "grad_norm": 0.3641822934150696, "learning_rate": 1.4e-07, "loss": 0.0163, "num_tokens": 72407545.0, "reward": 0.8389718532562256, "reward_std": 0.8429340124130249, "rewards/reward_func/mean": 0.8452737331390381, "rewards/reward_func/std": 0.8356252908706665, "rewards/soft_overlong_punishment_reward/mean": -0.0063018798828125, "rewards/soft_overlong_punishment_reward/std": 0.03564881905913353, "sampling/importance_sampling_ratio/max": 2.0940728187561035, "sampling/importance_sampling_ratio/mean": 0.9901467561721802, "sampling/importance_sampling_ratio/min": 0.021902216598391533, "sampling/sampling_logp_difference/max": 3.821167469024658, "sampling/sampling_logp_difference/mean": 0.0196835920214653, "step": 216, "step_time": 491.90342234587297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15762.0, "completions/max_terminated_length": 15762.0, "completions/mean_length": 10206.5, "completions/mean_terminated_length": 10206.5, "completions/min_length": 7317.0, "completions/min_terminated_length": 7317.0, "entropy": 0.28364065289497375, "epoch": 1.7642276422764227, "frac_reward_zero_std": 0.0, "grad_norm": 0.38103634119033813, "learning_rate": 1.36e-07, "loss": -0.0001, "num_tokens": 72752481.0, "reward": 1.2134435176849365, "reward_std": 0.40889275074005127, "rewards/reward_func/mean": 1.2134435176849365, "rewards/reward_func/std": 0.40889275074005127, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9898847341537476, "sampling/importance_sampling_ratio/min": 0.153355211019516, "sampling/sampling_logp_difference/max": 1.8749984502792358, "sampling/sampling_logp_difference/mean": 0.020069871097803116, "step": 217, "step_time": 341.48434051219374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16974.0, "completions/max_terminated_length": 16974.0, "completions/mean_length": 9566.84375, "completions/mean_terminated_length": 9566.84375, "completions/min_length": 5823.0, "completions/min_terminated_length": 5823.0, "entropy": 0.2919788621366024, "epoch": 1.7723577235772359, "frac_reward_zero_std": 0.0, "grad_norm": 0.36829307675361633, "learning_rate": 1.32e-07, "loss": 0.022, "num_tokens": 73086028.0, "reward": 1.9240275621414185, "reward_std": 2.6115894317626953, "rewards/reward_func/mean": 1.928528904914856, "rewards/reward_func/std": 2.6107239723205566, "rewards/soft_overlong_punishment_reward/mean": -0.0045013427734375, "rewards/soft_overlong_punishment_reward/std": 0.025463439524173737, "sampling/importance_sampling_ratio/max": 2.1823227405548096, "sampling/importance_sampling_ratio/mean": 0.9898126125335693, "sampling/importance_sampling_ratio/min": 0.03777245059609413, "sampling/sampling_logp_difference/max": 3.2761752605438232, "sampling/sampling_logp_difference/mean": 0.020412269979715347, "step": 218, "step_time": 350.4455557651818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15711.0, "completions/max_terminated_length": 15711.0, "completions/mean_length": 11780.21875, "completions/mean_terminated_length": 11780.21875, "completions/min_length": 6152.0, "completions/min_terminated_length": 6152.0, "entropy": 0.24620082695037127, "epoch": 1.7804878048780488, "frac_reward_zero_std": 0.0, "grad_norm": 0.3344440460205078, "learning_rate": 1.28e-07, "loss": -0.0446, "num_tokens": 73489971.0, "reward": 0.8368304967880249, "reward_std": 1.1214935779571533, "rewards/reward_func/mean": 0.8368304967880249, "rewards/reward_func/std": 1.1214934587478638, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.538177490234375, "sampling/importance_sampling_ratio/mean": 0.9915289878845215, "sampling/importance_sampling_ratio/min": 0.10874148458242416, "sampling/sampling_logp_difference/max": 2.2187819480895996, "sampling/sampling_logp_difference/mean": 0.01787884533405304, "step": 219, "step_time": 595.3615495576523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14569.0, "completions/max_terminated_length": 14569.0, "completions/mean_length": 10509.15625, "completions/mean_terminated_length": 10509.15625, "completions/min_length": 6042.0, "completions/min_terminated_length": 6042.0, "entropy": 0.27453439868986607, "epoch": 1.7886178861788617, "frac_reward_zero_std": 0.0, "grad_norm": 0.3674945533275604, "learning_rate": 1.24e-07, "loss": 0.0284, "num_tokens": 73844368.0, "reward": 0.9465268850326538, "reward_std": 0.6327102184295654, "rewards/reward_func/mean": 0.9465268850326538, "rewards/reward_func/std": 0.6327102184295654, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9903506636619568, "sampling/importance_sampling_ratio/min": 0.3419025242328644, "sampling/sampling_logp_difference/max": 1.3342759609222412, "sampling/sampling_logp_difference/mean": 0.019706957042217255, "step": 220, "step_time": 572.6308616143651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13662.0, "completions/max_terminated_length": 13662.0, "completions/mean_length": 8802.75, "completions/mean_terminated_length": 8802.75, "completions/min_length": 3935.0, "completions/min_terminated_length": 3935.0, "entropy": 0.27091467659920454, "epoch": 1.796747967479675, "frac_reward_zero_std": 0.0, "grad_norm": 0.4277440905570984, "learning_rate": 1.2e-07, "loss": 0.055, "num_tokens": 74147944.0, "reward": 1.7628746032714844, "reward_std": 1.1854737997055054, "rewards/reward_func/mean": 1.7628746032714844, "rewards/reward_func/std": 1.1854737997055054, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.934642791748047, "sampling/importance_sampling_ratio/mean": 0.9906680583953857, "sampling/importance_sampling_ratio/min": 0.030376434326171875, "sampling/sampling_logp_difference/max": 3.4940881729125977, "sampling/sampling_logp_difference/mean": 0.01885787770152092, "step": 221, "step_time": 350.9161086077802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12490.0, "completions/max_terminated_length": 12490.0, "completions/mean_length": 7828.4375, "completions/mean_terminated_length": 7828.4375, "completions/min_length": 2446.0, "completions/min_terminated_length": 2446.0, "entropy": 0.2959641069173813, "epoch": 1.8048780487804879, "frac_reward_zero_std": 0.0, "grad_norm": 0.4303174614906311, "learning_rate": 1.16e-07, "loss": 0.0128, "num_tokens": 74414678.0, "reward": 1.5567618608474731, "reward_std": 1.3865270614624023, "rewards/reward_func/mean": 1.5567618608474731, "rewards/reward_func/std": 1.386527180671692, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5652806758880615, "sampling/importance_sampling_ratio/mean": 0.9897814989089966, "sampling/importance_sampling_ratio/min": 0.29385438561439514, "sampling/sampling_logp_difference/max": 1.2246708869934082, "sampling/sampling_logp_difference/mean": 0.020555168390274048, "step": 222, "step_time": 305.8508344134316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13579.0, "completions/max_terminated_length": 13579.0, "completions/mean_length": 10411.9375, "completions/mean_terminated_length": 10411.9375, "completions/min_length": 7539.0, "completions/min_terminated_length": 7539.0, "entropy": 0.2726015029475093, "epoch": 1.8130081300813008, "frac_reward_zero_std": 0.0, "grad_norm": 0.35976549983024597, "learning_rate": 1.12e-07, "loss": -0.0198, "num_tokens": 74773260.0, "reward": 1.0551838874816895, "reward_std": 0.5486001968383789, "rewards/reward_func/mean": 1.0551838874816895, "rewards/reward_func/std": 0.5486001968383789, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9905358552932739, "sampling/importance_sampling_ratio/min": 0.10280817747116089, "sampling/sampling_logp_difference/max": 2.274890422821045, "sampling/sampling_logp_difference/mean": 0.019155049696564674, "step": 223, "step_time": 345.6332434962969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15476.0, "completions/max_terminated_length": 15476.0, "completions/mean_length": 10071.25, "completions/mean_terminated_length": 10071.25, "completions/min_length": 5875.0, "completions/min_terminated_length": 5875.0, "entropy": 0.2811074396595359, "epoch": 1.821138211382114, "frac_reward_zero_std": 0.0, "grad_norm": 0.4005601704120636, "learning_rate": 1.0799999999999999e-07, "loss": 0.0105, "num_tokens": 75125052.0, "reward": 1.0827958583831787, "reward_std": 0.6487172842025757, "rewards/reward_func/mean": 1.0827958583831787, "rewards/reward_func/std": 0.6487172842025757, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8473902940750122, "sampling/importance_sampling_ratio/mean": 0.990472674369812, "sampling/importance_sampling_ratio/min": 0.23046469688415527, "sampling/sampling_logp_difference/max": 1.4676575660705566, "sampling/sampling_logp_difference/mean": 0.019458888098597527, "step": 224, "step_time": 387.86132500413805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14500.0, "completions/max_terminated_length": 14500.0, "completions/mean_length": 10911.75, "completions/mean_terminated_length": 10911.75, "completions/min_length": 7969.0, "completions/min_terminated_length": 7969.0, "entropy": 0.261114988476038, "epoch": 1.8292682926829267, "frac_reward_zero_std": 0.0, "grad_norm": 0.3554852306842804, "learning_rate": 1.0399999999999999e-07, "loss": -0.0222, "num_tokens": 75501012.0, "reward": 1.656463623046875, "reward_std": 1.6964542865753174, "rewards/reward_func/mean": 1.656463623046875, "rewards/reward_func/std": 1.6964542865753174, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8839166164398193, "sampling/importance_sampling_ratio/mean": 0.9908844828605652, "sampling/importance_sampling_ratio/min": 0.188667431473732, "sampling/sampling_logp_difference/max": 1.667769432067871, "sampling/sampling_logp_difference/mean": 0.018746506422758102, "step": 225, "step_time": 437.71810589171946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15035.0, "completions/max_terminated_length": 15035.0, "completions/mean_length": 9346.78125, "completions/mean_terminated_length": 9346.78125, "completions/min_length": 5479.0, "completions/min_terminated_length": 5479.0, "entropy": 0.3042516093701124, "epoch": 1.8373983739837398, "frac_reward_zero_std": 0.0, "grad_norm": 0.42573827505111694, "learning_rate": 1e-07, "loss": -0.0004, "num_tokens": 75815341.0, "reward": 1.1006791591644287, "reward_std": 0.7395225167274475, "rewards/reward_func/mean": 1.1006791591644287, "rewards/reward_func/std": 0.7395225167274475, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.487826347351074, "sampling/importance_sampling_ratio/mean": 0.9894092082977295, "sampling/importance_sampling_ratio/min": 0.45079898834228516, "sampling/sampling_logp_difference/max": 0.9114093780517578, "sampling/sampling_logp_difference/mean": 0.021217308938503265, "step": 226, "step_time": 330.92210000101477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14943.0, "completions/max_terminated_length": 14943.0, "completions/mean_length": 9735.6875, "completions/mean_terminated_length": 9735.6875, "completions/min_length": 5857.0, "completions/min_terminated_length": 5857.0, "entropy": 0.3187424521893263, "epoch": 1.845528455284553, "frac_reward_zero_std": 0.0, "grad_norm": 0.4303300678730011, "learning_rate": 9.6e-08, "loss": 0.0728, "num_tokens": 76140595.0, "reward": 0.9577392339706421, "reward_std": 0.7074591517448425, "rewards/reward_func/mean": 0.9577392339706421, "rewards/reward_func/std": 0.7074590921401978, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.359483003616333, "sampling/importance_sampling_ratio/mean": 0.9887793660163879, "sampling/importance_sampling_ratio/min": 0.19865137338638306, "sampling/sampling_logp_difference/max": 1.6162039041519165, "sampling/sampling_logp_difference/mean": 0.022178202867507935, "step": 227, "step_time": 337.2866434077732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14953.0, "completions/max_terminated_length": 14953.0, "completions/mean_length": 9333.96875, "completions/mean_terminated_length": 9333.96875, "completions/min_length": 4264.0, "completions/min_terminated_length": 4264.0, "entropy": 0.28454318083822727, "epoch": 1.8536585365853657, "frac_reward_zero_std": 0.0, "grad_norm": 0.398489773273468, "learning_rate": 9.199999999999999e-08, "loss": 0.0021, "num_tokens": 76460146.0, "reward": 1.9499032497406006, "reward_std": 1.7529157400131226, "rewards/reward_func/mean": 1.9499032497406006, "rewards/reward_func/std": 1.7529157400131226, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.649365186691284, "sampling/importance_sampling_ratio/mean": 0.9902278780937195, "sampling/importance_sampling_ratio/min": 0.0009075120906345546, "sampling/sampling_logp_difference/max": 7.004803657531738, "sampling/sampling_logp_difference/mean": 0.020048227161169052, "step": 228, "step_time": 354.2017886976246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15784.0, "completions/max_terminated_length": 15784.0, "completions/mean_length": 11829.5625, "completions/mean_terminated_length": 11829.5625, "completions/min_length": 7684.0, "completions/min_terminated_length": 7684.0, "entropy": 0.26049436163157225, "epoch": 1.8617886178861789, "frac_reward_zero_std": 0.0, "grad_norm": 0.35490682721138, "learning_rate": 8.8e-08, "loss": 0.0044, "num_tokens": 76869980.0, "reward": 0.8487756252288818, "reward_std": 0.6379562020301819, "rewards/reward_func/mean": 0.8487756252288818, "rewards/reward_func/std": 0.6379562020301819, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9911534786224365, "sampling/importance_sampling_ratio/min": 0.10822553932666779, "sampling/sampling_logp_difference/max": 2.2235379219055176, "sampling/sampling_logp_difference/mean": 0.018788527697324753, "step": 229, "step_time": 401.4078663017135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18127.0, "completions/max_terminated_length": 18127.0, "completions/mean_length": 10958.9375, "completions/mean_terminated_length": 10958.9375, "completions/min_length": 6603.0, "completions/min_terminated_length": 6603.0, "entropy": 0.2978894282132387, "epoch": 1.8699186991869918, "frac_reward_zero_std": 0.0, "grad_norm": 0.3882869482040405, "learning_rate": 8.4e-08, "loss": 0.0189, "num_tokens": 77235146.0, "reward": 1.1151258945465088, "reward_std": 1.0892480611801147, "rewards/reward_func/mean": 1.1284239292144775, "rewards/reward_func/std": 1.0722978115081787, "rewards/soft_overlong_punishment_reward/mean": -0.01329803466796875, "rewards/soft_overlong_punishment_reward/std": 0.07522504031658173, "sampling/importance_sampling_ratio/max": 2.948336601257324, "sampling/importance_sampling_ratio/mean": 0.9897523522377014, "sampling/importance_sampling_ratio/min": 0.3448498249053955, "sampling/sampling_logp_difference/max": 1.0812411308288574, "sampling/sampling_logp_difference/mean": 0.021154817193746567, "step": 230, "step_time": 382.1135653653182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14865.0, "completions/max_terminated_length": 14865.0, "completions/mean_length": 10480.5, "completions/mean_terminated_length": 10480.5, "completions/min_length": 7920.0, "completions/min_terminated_length": 7920.0, "entropy": 0.26645700354129076, "epoch": 1.8780487804878048, "frac_reward_zero_std": 0.0, "grad_norm": 0.3675365149974823, "learning_rate": 8e-08, "loss": 0.0104, "num_tokens": 77595442.0, "reward": 1.146174669265747, "reward_std": 0.4796574115753174, "rewards/reward_func/mean": 1.146174669265747, "rewards/reward_func/std": 0.47965744137763977, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9905895590782166, "sampling/importance_sampling_ratio/min": 0.06983362138271332, "sampling/sampling_logp_difference/max": 2.66163969039917, "sampling/sampling_logp_difference/mean": 0.01905234530568123, "step": 231, "step_time": 409.8225920125842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13849.0, "completions/max_terminated_length": 13849.0, "completions/mean_length": 10798.125, "completions/mean_terminated_length": 10798.125, "completions/min_length": 8638.0, "completions/min_terminated_length": 8638.0, "entropy": 0.30643040873110294, "epoch": 1.886178861788618, "frac_reward_zero_std": 0.25, "grad_norm": 0.3219410479068756, "learning_rate": 7.599999999999999e-08, "loss": -0.0163, "num_tokens": 77959222.0, "reward": 0.9619793891906738, "reward_std": 0.6255706548690796, "rewards/reward_func/mean": 0.9619793891906738, "rewards/reward_func/std": 0.6255706548690796, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.219569444656372, "sampling/importance_sampling_ratio/mean": 0.9891918301582336, "sampling/importance_sampling_ratio/min": 0.1252058893442154, "sampling/sampling_logp_difference/max": 2.0777957439422607, "sampling/sampling_logp_difference/mean": 0.021789079532027245, "step": 232, "step_time": 394.94902472407557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15015.0, "completions/max_terminated_length": 15015.0, "completions/mean_length": 9467.5625, "completions/mean_terminated_length": 9467.5625, "completions/min_length": 5704.0, "completions/min_terminated_length": 5704.0, "entropy": 0.29639999382197857, "epoch": 1.8943089430894309, "frac_reward_zero_std": 0.0, "grad_norm": 0.39320817589759827, "learning_rate": 7.2e-08, "loss": 0.0284, "num_tokens": 78276592.0, "reward": 1.2161970138549805, "reward_std": 0.6799400448799133, "rewards/reward_func/mean": 1.2161970138549805, "rewards/reward_func/std": 0.6799400448799133, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0554580688476562, "sampling/importance_sampling_ratio/mean": 0.9896321296691895, "sampling/importance_sampling_ratio/min": 0.18130598962306976, "sampling/sampling_logp_difference/max": 1.7075691223144531, "sampling/sampling_logp_difference/mean": 0.020897675305604935, "step": 233, "step_time": 352.86834441358224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14053.0, "completions/max_terminated_length": 14053.0, "completions/mean_length": 8768.46875, "completions/mean_terminated_length": 8768.46875, "completions/min_length": 5297.0, "completions/min_terminated_length": 5297.0, "entropy": 0.3119491506367922, "epoch": 1.9024390243902438, "frac_reward_zero_std": 0.0, "grad_norm": 0.4400003254413605, "learning_rate": 6.8e-08, "loss": 0.0034, "num_tokens": 78571071.0, "reward": 0.8220476508140564, "reward_std": 0.7369102239608765, "rewards/reward_func/mean": 0.8220476508140564, "rewards/reward_func/std": 0.7369101643562317, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8682268857955933, "sampling/importance_sampling_ratio/mean": 0.9890620708465576, "sampling/importance_sampling_ratio/min": 0.3670034110546112, "sampling/sampling_logp_difference/max": 1.0023841857910156, "sampling/sampling_logp_difference/mean": 0.02175344154238701, "step": 234, "step_time": 319.42806904623285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14845.0, "completions/max_terminated_length": 14845.0, "completions/mean_length": 9297.9375, "completions/mean_terminated_length": 9297.9375, "completions/min_length": 6124.0, "completions/min_terminated_length": 6124.0, "entropy": 0.3066948615014553, "epoch": 1.910569105691057, "frac_reward_zero_std": 0.0, "grad_norm": 0.4045936167240143, "learning_rate": 6.4e-08, "loss": 0.0253, "num_tokens": 78882533.0, "reward": 1.8184316158294678, "reward_std": 2.019770383834839, "rewards/reward_func/mean": 1.8184316158294678, "rewards/reward_func/std": 2.019770383834839, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.011228561401367, "sampling/importance_sampling_ratio/mean": 0.9893174171447754, "sampling/importance_sampling_ratio/min": 0.18020933866500854, "sampling/sampling_logp_difference/max": 1.7136361598968506, "sampling/sampling_logp_difference/mean": 0.021685179322957993, "step": 235, "step_time": 336.20116131124087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12692.0, "completions/max_terminated_length": 12692.0, "completions/mean_length": 9302.34375, "completions/mean_terminated_length": 9302.34375, "completions/min_length": 5987.0, "completions/min_terminated_length": 5987.0, "entropy": 0.28985567949712276, "epoch": 1.91869918699187, "frac_reward_zero_std": 0.0, "grad_norm": 0.41015562415122986, "learning_rate": 6e-08, "loss": -0.0435, "num_tokens": 79190928.0, "reward": 1.1640218496322632, "reward_std": 0.5124284029006958, "rewards/reward_func/mean": 1.1640218496322632, "rewards/reward_func/std": 0.5124284029006958, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1602694988250732, "sampling/importance_sampling_ratio/mean": 0.9901226162910461, "sampling/importance_sampling_ratio/min": 0.28592556715011597, "sampling/sampling_logp_difference/max": 1.252023696899414, "sampling/sampling_logp_difference/mean": 0.020301442593336105, "step": 236, "step_time": 299.4945896314457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16871.0, "completions/max_terminated_length": 16871.0, "completions/mean_length": 11865.28125, "completions/mean_terminated_length": 11865.28125, "completions/min_length": 6579.0, "completions/min_terminated_length": 6579.0, "entropy": 0.26952589210122824, "epoch": 1.9268292682926829, "frac_reward_zero_std": 0.0, "grad_norm": 0.33614400029182434, "learning_rate": 5.6e-08, "loss": 0.0191, "num_tokens": 79590737.0, "reward": 1.8198858499526978, "reward_std": 4.182967662811279, "rewards/reward_func/mean": 1.8236013650894165, "rewards/reward_func/std": 4.181288719177246, "rewards/soft_overlong_punishment_reward/mean": -0.00371551513671875, "rewards/soft_overlong_punishment_reward/std": 0.021018126979470253, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9906592965126038, "sampling/importance_sampling_ratio/min": 0.26234784722328186, "sampling/sampling_logp_difference/max": 1.3380839824676514, "sampling/sampling_logp_difference/mean": 0.01902042329311371, "step": 237, "step_time": 441.34759285254404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15871.0, "completions/max_terminated_length": 15871.0, "completions/mean_length": 11183.9375, "completions/mean_terminated_length": 11183.9375, "completions/min_length": 6786.0, "completions/min_terminated_length": 6786.0, "entropy": 0.2905911058187485, "epoch": 1.934959349593496, "frac_reward_zero_std": 0.0, "grad_norm": 0.3801893889904022, "learning_rate": 5.1999999999999996e-08, "loss": -0.0176, "num_tokens": 79965479.0, "reward": 0.8954036235809326, "reward_std": 0.6015174388885498, "rewards/reward_func/mean": 0.8954036235809326, "rewards/reward_func/std": 0.6015174388885498, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7719638347625732, "sampling/importance_sampling_ratio/mean": 0.9899714589118958, "sampling/importance_sampling_ratio/min": 0.08788038045167923, "sampling/sampling_logp_difference/max": 2.4317786693573, "sampling/sampling_logp_difference/mean": 0.020339064300060272, "step": 238, "step_time": 364.3018519633915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15284.0, "completions/max_terminated_length": 15284.0, "completions/mean_length": 11272.84375, "completions/mean_terminated_length": 11272.84375, "completions/min_length": 7006.0, "completions/min_terminated_length": 7006.0, "entropy": 0.27012884337455034, "epoch": 1.943089430894309, "frac_reward_zero_std": 0.0, "grad_norm": 0.3917267322540283, "learning_rate": 4.8e-08, "loss": 0.0152, "num_tokens": 80347138.0, "reward": 3.6909549236297607, "reward_std": 5.090480327606201, "rewards/reward_func/mean": 3.6909549236297607, "rewards/reward_func/std": 5.090480804443359, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9906477332115173, "sampling/importance_sampling_ratio/min": 0.17718777060508728, "sampling/sampling_logp_difference/max": 1.7883415222167969, "sampling/sampling_logp_difference/mean": 0.019167255610227585, "step": 239, "step_time": 477.38329637190327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13860.0, "completions/max_terminated_length": 13860.0, "completions/mean_length": 10537.125, "completions/mean_terminated_length": 10537.125, "completions/min_length": 7223.0, "completions/min_terminated_length": 7223.0, "entropy": 0.2844161428511143, "epoch": 1.951219512195122, "frac_reward_zero_std": 0.0, "grad_norm": 0.35918739438056946, "learning_rate": 4.4e-08, "loss": -0.0147, "num_tokens": 80703366.0, "reward": 1.0506994724273682, "reward_std": 0.6146509647369385, "rewards/reward_func/mean": 1.0506994724273682, "rewards/reward_func/std": 0.6146509051322937, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.868777871131897, "sampling/importance_sampling_ratio/mean": 0.9900673627853394, "sampling/importance_sampling_ratio/min": 0.10262572020292282, "sampling/sampling_logp_difference/max": 2.2766666412353516, "sampling/sampling_logp_difference/mean": 0.01999824121594429, "step": 240, "step_time": 330.3526580410544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17687.0, "completions/max_terminated_length": 17687.0, "completions/mean_length": 11904.0, "completions/mean_terminated_length": 11904.0, "completions/min_length": 6393.0, "completions/min_terminated_length": 6393.0, "entropy": 0.2790603097528219, "epoch": 1.959349593495935, "frac_reward_zero_std": 0.0, "grad_norm": 0.37855809926986694, "learning_rate": 4e-08, "loss": 0.0527, "num_tokens": 81114366.0, "reward": 0.9515074491500854, "reward_std": 0.5839796662330627, "rewards/reward_func/mean": 0.9614485502243042, "rewards/reward_func/std": 0.588039755821228, "rewards/soft_overlong_punishment_reward/mean": -0.00994110107421875, "rewards/soft_overlong_punishment_reward/std": 0.056235358119010925, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9902161955833435, "sampling/importance_sampling_ratio/min": 0.05338960140943527, "sampling/sampling_logp_difference/max": 2.9301393032073975, "sampling/sampling_logp_difference/mean": 0.01992051862180233, "step": 241, "step_time": 570.8844402271789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16672.0, "completions/max_terminated_length": 16672.0, "completions/mean_length": 10269.0625, "completions/mean_terminated_length": 10269.0625, "completions/min_length": 4406.0, "completions/min_terminated_length": 4406.0, "entropy": 0.3053451906889677, "epoch": 1.967479674796748, "frac_reward_zero_std": 0.0, "grad_norm": 0.38739898800849915, "learning_rate": 3.6e-08, "loss": -0.0214, "num_tokens": 81456344.0, "reward": 1.5581142902374268, "reward_std": 1.299147129058838, "rewards/reward_func/mean": 1.5603115558624268, "rewards/reward_func/std": 1.298970341682434, "rewards/soft_overlong_punishment_reward/mean": -0.002197265625, "rewards/soft_overlong_punishment_reward/std": 0.012429611757397652, "sampling/importance_sampling_ratio/max": 1.9272154569625854, "sampling/importance_sampling_ratio/mean": 0.9893536567687988, "sampling/importance_sampling_ratio/min": 0.2590552270412445, "sampling/sampling_logp_difference/max": 1.3507139682769775, "sampling/sampling_logp_difference/mean": 0.021601153537631035, "step": 242, "step_time": 396.345472401008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13318.0, "completions/max_terminated_length": 13318.0, "completions/mean_length": 8007.4375, "completions/mean_terminated_length": 8007.4375, "completions/min_length": 4218.0, "completions/min_terminated_length": 4218.0, "entropy": 0.3001875299960375, "epoch": 1.975609756097561, "frac_reward_zero_std": 0.0, "grad_norm": 0.4350431263446808, "learning_rate": 3.2e-08, "loss": -0.0367, "num_tokens": 81723414.0, "reward": 1.3142027854919434, "reward_std": 0.18320664763450623, "rewards/reward_func/mean": 1.3142027854919434, "rewards/reward_func/std": 0.18320664763450623, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1279447078704834, "sampling/importance_sampling_ratio/mean": 0.9896602034568787, "sampling/importance_sampling_ratio/min": 0.34697410464286804, "sampling/sampling_logp_difference/max": 1.0585050582885742, "sampling/sampling_logp_difference/mean": 0.02109161950647831, "step": 243, "step_time": 296.47460809443146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15357.0, "completions/max_terminated_length": 15357.0, "completions/mean_length": 11084.78125, "completions/mean_terminated_length": 11084.78125, "completions/min_length": 7900.0, "completions/min_terminated_length": 7900.0, "entropy": 0.2727947048842907, "epoch": 1.9837398373983741, "frac_reward_zero_std": 0.0, "grad_norm": 0.748999297618866, "learning_rate": 2.8e-08, "loss": 0.0009, "num_tokens": 82106623.0, "reward": 1.7209361791610718, "reward_std": 1.6489285230636597, "rewards/reward_func/mean": 1.7209361791610718, "rewards/reward_func/std": 1.6489285230636597, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9903358817100525, "sampling/importance_sampling_ratio/min": 0.017168119549751282, "sampling/sampling_logp_difference/max": 4.064701080322266, "sampling/sampling_logp_difference/mean": 0.01944074034690857, "step": 244, "step_time": 410.3419267425779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14174.0, "completions/max_terminated_length": 14174.0, "completions/mean_length": 9691.53125, "completions/mean_terminated_length": 9691.53125, "completions/min_length": 6656.0, "completions/min_terminated_length": 6656.0, "entropy": 0.3040638938546181, "epoch": 1.9918699186991868, "frac_reward_zero_std": 0.0, "grad_norm": 0.3910830318927765, "learning_rate": 2.4e-08, "loss": -0.0349, "num_tokens": 82432072.0, "reward": 1.075455665588379, "reward_std": 0.8865517377853394, "rewards/reward_func/mean": 1.075455665588379, "rewards/reward_func/std": 0.8865517973899841, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2025558948516846, "sampling/importance_sampling_ratio/mean": 0.9893741011619568, "sampling/importance_sampling_ratio/min": 0.3180047869682312, "sampling/sampling_logp_difference/max": 1.145688772201538, "sampling/sampling_logp_difference/mean": 0.02134772762656212, "step": 245, "step_time": 321.8410828395281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15518.0, "completions/max_terminated_length": 15518.0, "completions/mean_length": 10144.40625, "completions/mean_terminated_length": 10144.40625, "completions/min_length": 5058.0, "completions/min_terminated_length": 5058.0, "entropy": 0.29126408137381077, "epoch": 2.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.34645918011665344, "learning_rate": 2e-08, "loss": -0.0083, "num_tokens": 82772749.0, "reward": 1.1977554559707642, "reward_std": 1.0357450246810913, "rewards/reward_func/mean": 1.1977554559707642, "rewards/reward_func/std": 1.0357449054718018, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9901622533798218, "sampling/importance_sampling_ratio/min": 0.0009917415445670485, "sampling/sampling_logp_difference/max": 6.916048049926758, "sampling/sampling_logp_difference/mean": 0.020361589267849922, "step": 246, "step_time": 377.6803415892646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16982.0, "completions/max_terminated_length": 16982.0, "completions/mean_length": 9476.9375, "completions/mean_terminated_length": 9476.9375, "completions/min_length": 3324.0, "completions/min_terminated_length": 3324.0, "entropy": 0.2707563489675522, "epoch": 2.008130081300813, "frac_reward_zero_std": 0.0, "grad_norm": 0.39088794589042664, "learning_rate": 1.6e-08, "loss": 0.0193, "num_tokens": 83098347.0, "reward": 1.1425082683563232, "reward_std": 0.49591726064682007, "rewards/reward_func/mean": 1.1470706462860107, "rewards/reward_func/std": 0.4847003221511841, "rewards/soft_overlong_punishment_reward/mean": -0.0045623779296875, "rewards/soft_overlong_punishment_reward/std": 0.025808705016970634, "sampling/importance_sampling_ratio/max": 2.0281832218170166, "sampling/importance_sampling_ratio/mean": 0.9908162355422974, "sampling/importance_sampling_ratio/min": 0.12059780210256577, "sampling/sampling_logp_difference/max": 2.1152942180633545, "sampling/sampling_logp_difference/mean": 0.018710073083639145, "step": 247, "step_time": 388.46342860814184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15937.0, "completions/max_terminated_length": 15937.0, "completions/mean_length": 10819.40625, "completions/mean_terminated_length": 10819.40625, "completions/min_length": 5389.0, "completions/min_terminated_length": 5389.0, "entropy": 0.24684153776615858, "epoch": 2.016260162601626, "frac_reward_zero_std": 0.0, "grad_norm": 0.35437819361686707, "learning_rate": 1.2e-08, "loss": -0.026, "num_tokens": 83472368.0, "reward": 0.9455545544624329, "reward_std": 0.6301490664482117, "rewards/reward_func/mean": 0.9455545544624329, "rewards/reward_func/std": 0.6301490664482117, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9916300773620605, "sampling/importance_sampling_ratio/min": 0.03407047688961029, "sampling/sampling_logp_difference/max": 3.379323959350586, "sampling/sampling_logp_difference/mean": 0.01745373196899891, "step": 248, "step_time": 433.2920012548566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12313.0, "completions/max_terminated_length": 12313.0, "completions/mean_length": 8494.34375, "completions/mean_terminated_length": 8494.34375, "completions/min_length": 5617.0, "completions/min_terminated_length": 5617.0, "entropy": 0.3050242904573679, "epoch": 2.024390243902439, "frac_reward_zero_std": 0.25, "grad_norm": 0.38736531138420105, "learning_rate": 8e-09, "loss": 0.0544, "num_tokens": 83755963.0, "reward": 1.0841058492660522, "reward_std": 0.6200099587440491, "rewards/reward_func/mean": 1.0841058492660522, "rewards/reward_func/std": 0.6200099587440491, "rewards/soft_overlong_punishment_reward/mean": 0.0, "rewards/soft_overlong_punishment_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7827914953231812, "sampling/importance_sampling_ratio/mean": 0.9893178939819336, "sampling/importance_sampling_ratio/min": 0.07585831731557846, "sampling/sampling_logp_difference/max": 2.578887939453125, "sampling/sampling_logp_difference/mean": 0.021333064883947372, "step": 249, "step_time": 296.1807095913682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16956.0, "completions/max_terminated_length": 16956.0, "completions/mean_length": 11429.84375, "completions/mean_terminated_length": 11429.84375, "completions/min_length": 7864.0, "completions/min_terminated_length": 7864.0, "entropy": 0.2977316789329052, "epoch": 2.032520325203252, "frac_reward_zero_std": 0.0, "grad_norm": 0.3691120445728302, "learning_rate": 4e-09, "loss": -0.0215, "num_tokens": 84138822.0, "reward": 0.8533658981323242, "reward_std": 0.6907368898391724, "rewards/reward_func/mean": 0.8577299118041992, "rewards/reward_func/std": 0.6934342384338379, "rewards/soft_overlong_punishment_reward/mean": -0.004364013671875, "rewards/soft_overlong_punishment_reward/std": 0.02468658797442913, "sampling/importance_sampling_ratio/max": 2.4678995609283447, "sampling/importance_sampling_ratio/mean": 0.9895965456962585, "sampling/importance_sampling_ratio/min": 0.27175045013427734, "sampling/sampling_logp_difference/max": 1.3028711080551147, "sampling/sampling_logp_difference/mean": 0.02109983190894127, "step": 250, "step_time": 396.94350118376315 } ], "logging_steps": 1, "max_steps": 250, "num_input_tokens_seen": 84138822, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }