diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.35832011699676514, + "epoch": 0.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.23671293258667, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.2758, + "num_tokens": 5417.0, + "reward": 0.4775000214576721, + "reward_std": 0.5056283473968506, + "rewards/reward_func/mean": 0.4775000214576721, + "rewards/reward_func/std": 0.5403900742530823, + "sampling/importance_sampling_ratio/max": 2.4071154594421387, + "sampling/importance_sampling_ratio/mean": 1.1429595947265625, + "sampling/importance_sampling_ratio/min": 0.5015585422515869, + "sampling/sampling_logp_difference/max": 0.5305562019348145, + "sampling/sampling_logp_difference/mean": 0.024324804544448853, + "step": 1, + "step_time": 29.307177749986295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3473261594772339, + "epoch": 0.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3394556045532227, + "kl": 0.0, + "learning_rate": 1.6666666666666668e-07, + "loss": 0.2918, + "num_tokens": 11253.0, + "reward": 0.581250011920929, + "reward_std": 0.5712425708770752, + "rewards/reward_func/mean": 0.581250011920929, + "rewards/reward_func/std": 0.5513473749160767, + "sampling/importance_sampling_ratio/max": 2.3380353450775146, + "sampling/importance_sampling_ratio/mean": 1.2109484672546387, + "sampling/importance_sampling_ratio/min": 0.4137703776359558, + "sampling/sampling_logp_difference/max": 0.6683757305145264, + "sampling/sampling_logp_difference/mean": 0.024658963084220886, + "step": 2, + "step_time": 40.91707400101586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 51.75, + "completions/mean_terminated_length": 51.75, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.36352208256721497, + "epoch": 0.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7528427839279175, + "kl": 0.0018581235781311989, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.0856, + "num_tokens": 16645.0, + "reward": 0.22500000894069672, + "reward_std": 0.3063344955444336, + "rewards/reward_func/mean": 0.22500000894069672, + "rewards/reward_func/std": 0.4666905105113983, + "sampling/importance_sampling_ratio/max": 1.6700822114944458, + "sampling/importance_sampling_ratio/mean": 1.325523018836975, + "sampling/importance_sampling_ratio/min": 0.6139910221099854, + "sampling/sampling_logp_difference/max": 0.3466939926147461, + "sampling/sampling_logp_difference/mean": 0.0239357128739357, + "step": 3, + "step_time": 33.77775888898759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.335945725440979, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.072298526763916, + "kl": 0.0018037607660517097, + "learning_rate": 5.000000000000001e-07, + "loss": 0.1746, + "num_tokens": 22430.0, + "reward": 0.21375000476837158, + "reward_std": 0.5115964412689209, + "rewards/reward_func/mean": 0.21375000476837158, + "rewards/reward_func/std": 0.47388777136802673, + "sampling/importance_sampling_ratio/max": 2.130910873413086, + "sampling/importance_sampling_ratio/mean": 0.9638596773147583, + "sampling/importance_sampling_ratio/min": 0.3092893362045288, + "sampling/sampling_logp_difference/max": 0.9354848861694336, + "sampling/sampling_logp_difference/mean": 0.022302545607089996, + "step": 4, + "step_time": 34.65735469799256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 45.625, + "completions/mean_terminated_length": 45.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3547826111316681, + "epoch": 0.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7712143659591675, + "kl": 0.0015525126364082098, + "learning_rate": 6.666666666666667e-07, + "loss": 0.1898, + "num_tokens": 28484.0, + "reward": 0.20000000298023224, + "reward_std": 0.49363037943840027, + "rewards/reward_func/mean": 0.20000000298023224, + "rewards/reward_func/std": 0.4572901427745819, + "sampling/importance_sampling_ratio/max": 2.8111071586608887, + "sampling/importance_sampling_ratio/mean": 1.2563235759735107, + "sampling/importance_sampling_ratio/min": 0.7284324169158936, + "sampling/sampling_logp_difference/max": 0.39002323150634766, + "sampling/sampling_logp_difference/mean": 0.02487805485725403, + "step": 5, + "step_time": 39.42609074199572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.4256824254989624, + "epoch": 0.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2577799558639526, + "kl": 0.002119219396263361, + "learning_rate": 8.333333333333333e-07, + "loss": -0.1565, + "num_tokens": 33246.0, + "reward": 0.48250001668930054, + "reward_std": 0.5949929356575012, + "rewards/reward_func/mean": 0.48250001668930054, + "rewards/reward_func/std": 0.5508629679679871, + "sampling/importance_sampling_ratio/max": 1.764662742614746, + "sampling/importance_sampling_ratio/mean": 1.1115164756774902, + "sampling/importance_sampling_ratio/min": 0.4326048195362091, + "sampling/sampling_logp_difference/max": 0.35713624954223633, + "sampling/sampling_logp_difference/mean": 0.023226505145430565, + "step": 6, + "step_time": 25.11293228599243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 54.5, + "completions/mean_terminated_length": 54.5, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3634033203125, + "epoch": 0.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7828105688095093, + "kl": 0.0015323495026677847, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0796, + "num_tokens": 39042.0, + "reward": 0.3125, + "reward_std": 0.5887748003005981, + "rewards/reward_func/mean": 0.3125, + "rewards/reward_func/std": 0.5680983662605286, + "sampling/importance_sampling_ratio/max": 1.3392544984817505, + "sampling/importance_sampling_ratio/mean": 0.7953487634658813, + "sampling/importance_sampling_ratio/min": 0.4173814654350281, + "sampling/sampling_logp_difference/max": 0.29545068740844727, + "sampling/sampling_logp_difference/mean": 0.025280017405748367, + "step": 7, + "step_time": 47.24140843501664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 43.375, + "completions/mean_terminated_length": 43.375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3641508221626282, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1968954801559448, + "kl": 0.0013946478720754385, + "learning_rate": 1.1666666666666668e-06, + "loss": -0.1307, + "num_tokens": 44922.0, + "reward": 0.22875000536441803, + "reward_std": 0.2921527922153473, + "rewards/reward_func/mean": 0.22875000536441803, + "rewards/reward_func/std": 0.4607583284378052, + "sampling/importance_sampling_ratio/max": 1.5681222677230835, + "sampling/importance_sampling_ratio/mean": 1.014966368675232, + "sampling/importance_sampling_ratio/min": 0.7567934393882751, + "sampling/sampling_logp_difference/max": 0.34651947021484375, + "sampling/sampling_logp_difference/mean": 0.01997371017932892, + "step": 8, + "step_time": 34.33773313398706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 55.625, + "completions/mean_terminated_length": 55.625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.35281139612197876, + "epoch": 0.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8254293203353882, + "kl": 0.002236333442851901, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0283, + "num_tokens": 50617.0, + "reward": 0.46875, + "reward_std": 0.5300248861312866, + "rewards/reward_func/mean": 0.46875, + "rewards/reward_func/std": 0.5659489631652832, + "sampling/importance_sampling_ratio/max": 1.2048767805099487, + "sampling/importance_sampling_ratio/mean": 0.7666900157928467, + "sampling/importance_sampling_ratio/min": 0.39571237564086914, + "sampling/sampling_logp_difference/max": 0.35016971826553345, + "sampling/sampling_logp_difference/mean": 0.025727007538080215, + "step": 9, + "step_time": 34.18884765400435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 50.875, + "completions/mean_terminated_length": 50.875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3982703983783722, + "epoch": 0.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.185789942741394, + "kl": 0.001458184327930212, + "learning_rate": 1.5e-06, + "loss": 0.0761, + "num_tokens": 56268.0, + "reward": 0.07750000059604645, + "reward_std": 0.28465136885643005, + "rewards/reward_func/mean": 0.07750000059604645, + "rewards/reward_func/std": 0.3708580732345581, + "sampling/importance_sampling_ratio/max": 2.0030765533447266, + "sampling/importance_sampling_ratio/mean": 0.9082742929458618, + "sampling/importance_sampling_ratio/min": 0.42338261008262634, + "sampling/sampling_logp_difference/max": 0.4783933162689209, + "sampling/sampling_logp_difference/mean": 0.023844268172979355, + "step": 10, + "step_time": 39.159437736991094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 54.875, + "completions/mean_terminated_length": 54.875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3613673448562622, + "epoch": 0.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8812184929847717, + "kl": 0.0014497374650090933, + "learning_rate": 1.6666666666666667e-06, + "loss": -0.0696, + "num_tokens": 62534.0, + "reward": 0.3400000035762787, + "reward_std": 0.27956950664520264, + "rewards/reward_func/mean": 0.3400000035762787, + "rewards/reward_func/std": 0.543007493019104, + "sampling/importance_sampling_ratio/max": 1.5621131658554077, + "sampling/importance_sampling_ratio/mean": 0.8559645414352417, + "sampling/importance_sampling_ratio/min": 0.45671403408050537, + "sampling/sampling_logp_difference/max": 0.3955717086791992, + "sampling/sampling_logp_difference/mean": 0.02080589532852173, + "step": 11, + "step_time": 39.89227997799753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.39176082611083984, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1600431203842163, + "kl": 0.0020833718590438366, + "learning_rate": 1.8333333333333333e-06, + "loss": -0.0961, + "num_tokens": 68151.0, + "reward": 0.32374998927116394, + "reward_std": 0.5406870245933533, + "rewards/reward_func/mean": 0.32374998927116394, + "rewards/reward_func/std": 0.5189808011054993, + "sampling/importance_sampling_ratio/max": 2.046029806137085, + "sampling/importance_sampling_ratio/mean": 1.0404480695724487, + "sampling/importance_sampling_ratio/min": 0.48177048563957214, + "sampling/sampling_logp_difference/max": 0.2973281145095825, + "sampling/sampling_logp_difference/mean": 0.024639006704092026, + "step": 12, + "step_time": 48.536910057999194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 43.375, + "completions/mean_terminated_length": 43.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3269670605659485, + "epoch": 0.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8620632886886597, + "kl": 0.001273418078199029, + "learning_rate": 2.0000000000000003e-06, + "loss": -0.081, + "num_tokens": 73963.0, + "reward": 0.3425000011920929, + "reward_std": 0.5563790202140808, + "rewards/reward_func/mean": 0.3425000011920929, + "rewards/reward_func/std": 0.5344623923301697, + "sampling/importance_sampling_ratio/max": 1.4767922163009644, + "sampling/importance_sampling_ratio/mean": 0.8396698236465454, + "sampling/importance_sampling_ratio/min": 0.5644444823265076, + "sampling/sampling_logp_difference/max": 0.2883424758911133, + "sampling/sampling_logp_difference/mean": 0.024868279695510864, + "step": 13, + "step_time": 33.96547721300158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.31234925985336304, + "epoch": 0.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1137773990631104, + "kl": 0.0012005593162029982, + "learning_rate": 2.166666666666667e-06, + "loss": 0.3334, + "num_tokens": 78838.0, + "reward": 0.29625001549720764, + "reward_std": 0.6014425754547119, + "rewards/reward_func/mean": 0.29625001549720764, + "rewards/reward_func/std": 0.5761184692382812, + "sampling/importance_sampling_ratio/max": 2.09089994430542, + "sampling/importance_sampling_ratio/mean": 1.2477295398712158, + "sampling/importance_sampling_ratio/min": 0.702942430973053, + "sampling/sampling_logp_difference/max": 0.46815013885498047, + "sampling/sampling_logp_difference/mean": 0.019913293421268463, + "step": 14, + "step_time": 33.421214936010074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 50.625, + "completions/mean_terminated_length": 50.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3603006601333618, + "epoch": 0.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8420554399490356, + "kl": 0.001688068499788642, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.3399, + "num_tokens": 84243.0, + "reward": 0.07874999940395355, + "reward_std": 0.2735734283924103, + "rewards/reward_func/mean": 0.07874999940395355, + "rewards/reward_func/std": 0.3578681945800781, + "sampling/importance_sampling_ratio/max": 2.986236095428467, + "sampling/importance_sampling_ratio/mean": 1.2305893898010254, + "sampling/importance_sampling_ratio/min": 0.7438207864761353, + "sampling/sampling_logp_difference/max": 0.5467426776885986, + "sampling/sampling_logp_difference/mean": 0.024384144693613052, + "step": 15, + "step_time": 35.22606979601551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 45.0, + "completions/mean_terminated_length": 45.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3401448726654053, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2467572689056396, + "kl": 0.0013698764378204942, + "learning_rate": 2.5e-06, + "loss": 0.0465, + "num_tokens": 89603.0, + "reward": 0.1887499988079071, + "reward_std": 0.33193475008010864, + "rewards/reward_func/mean": 0.1887499988079071, + "rewards/reward_func/std": 0.48774808645248413, + "sampling/importance_sampling_ratio/max": 1.0488877296447754, + "sampling/importance_sampling_ratio/mean": 0.8098611831665039, + "sampling/importance_sampling_ratio/min": 0.5529040694236755, + "sampling/sampling_logp_difference/max": 0.4784054756164551, + "sampling/sampling_logp_difference/mean": 0.021436292678117752, + "step": 16, + "step_time": 40.98757715098327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 46.875, + "completions/mean_terminated_length": 46.875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3725942373275757, + "epoch": 0.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.417001724243164, + "kl": 0.0016124111134558916, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0637, + "num_tokens": 94945.0, + "reward": 0.05000000447034836, + "reward_std": 0.2862437069416046, + "rewards/reward_func/mean": 0.05000000447034836, + "rewards/reward_func/std": 0.38652294874191284, + "sampling/importance_sampling_ratio/max": 1.8293613195419312, + "sampling/importance_sampling_ratio/mean": 1.3590400218963623, + "sampling/importance_sampling_ratio/min": 0.8256513476371765, + "sampling/sampling_logp_difference/max": 0.3571474552154541, + "sampling/sampling_logp_difference/mean": 0.020312845706939697, + "step": 17, + "step_time": 37.4742742870003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3666185140609741, + "epoch": 0.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.168238639831543, + "kl": 0.001655534841120243, + "learning_rate": 2.8333333333333335e-06, + "loss": -0.1929, + "num_tokens": 100888.0, + "reward": 0.20124998688697815, + "reward_std": 0.5236045122146606, + "rewards/reward_func/mean": 0.20124998688697815, + "rewards/reward_func/std": 0.48489874601364136, + "sampling/importance_sampling_ratio/max": 1.673153281211853, + "sampling/importance_sampling_ratio/mean": 1.0230400562286377, + "sampling/importance_sampling_ratio/min": 0.5740097165107727, + "sampling/sampling_logp_difference/max": 0.27298808097839355, + "sampling/sampling_logp_difference/mean": 0.02411050722002983, + "step": 18, + "step_time": 36.143502769002225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 54.125, + "completions/mean_terminated_length": 54.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.37803328037261963, + "epoch": 0.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9871008396148682, + "kl": 0.002036505378782749, + "learning_rate": 3e-06, + "loss": -0.0558, + "num_tokens": 106144.0, + "reward": 0.35625001788139343, + "reward_std": 0.5298318266868591, + "rewards/reward_func/mean": 0.35625001788139343, + "rewards/reward_func/std": 0.5088625550270081, + "sampling/importance_sampling_ratio/max": 1.4062168598175049, + "sampling/importance_sampling_ratio/mean": 0.9718549251556396, + "sampling/importance_sampling_ratio/min": 0.3938085734844208, + "sampling/sampling_logp_difference/max": 0.3405449390411377, + "sampling/sampling_logp_difference/mean": 0.02122277393937111, + "step": 19, + "step_time": 39.01482636100263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.4097254276275635, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3784204721450806, + "kl": 0.0016567106358706951, + "learning_rate": 3.1666666666666667e-06, + "loss": 0.0886, + "num_tokens": 112107.0, + "reward": -0.057499997317790985, + "reward_std": 0.044269345700740814, + "rewards/reward_func/mean": -0.057499997317790985, + "rewards/reward_func/std": 0.04166190326213837, + "sampling/importance_sampling_ratio/max": 2.0899689197540283, + "sampling/importance_sampling_ratio/mean": 1.132345199584961, + "sampling/importance_sampling_ratio/min": 0.4411206543445587, + "sampling/sampling_logp_difference/max": 0.5205492973327637, + "sampling/sampling_logp_difference/mean": 0.027103282511234283, + "step": 20, + "step_time": 45.0223436219967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.36454537510871887, + "epoch": 0.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7992667555809021, + "kl": 0.0020659861620515585, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1669, + "num_tokens": 117999.0, + "reward": 0.32624998688697815, + "reward_std": 0.5471616387367249, + "rewards/reward_func/mean": 0.32624998688697815, + "rewards/reward_func/std": 0.5324857831001282, + "sampling/importance_sampling_ratio/max": 1.8343143463134766, + "sampling/importance_sampling_ratio/mean": 0.8793189525604248, + "sampling/importance_sampling_ratio/min": 0.3384288549423218, + "sampling/sampling_logp_difference/max": 0.4840106964111328, + "sampling/sampling_logp_difference/mean": 0.02400803565979004, + "step": 21, + "step_time": 40.175860888994066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.40864837169647217, + "epoch": 0.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.843110978603363, + "kl": 0.001421776250936091, + "learning_rate": 3.5e-06, + "loss": -0.0948, + "num_tokens": 123708.0, + "reward": 0.4725000262260437, + "reward_std": 0.5133668184280396, + "rewards/reward_func/mean": 0.4725000262260437, + "rewards/reward_func/std": 0.549278736114502, + "sampling/importance_sampling_ratio/max": 1.4361987113952637, + "sampling/importance_sampling_ratio/mean": 0.8868111371994019, + "sampling/importance_sampling_ratio/min": 0.42872440814971924, + "sampling/sampling_logp_difference/max": 0.33927369117736816, + "sampling/sampling_logp_difference/mean": 0.02582230418920517, + "step": 22, + "step_time": 168.2754703540122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.32568132877349854, + "epoch": 0.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.166347861289978, + "kl": 0.0016544836107641459, + "learning_rate": 3.6666666666666666e-06, + "loss": -0.0458, + "num_tokens": 129149.0, + "reward": 0.19249999523162842, + "reward_std": 0.5302917957305908, + "rewards/reward_func/mean": 0.19249999523162842, + "rewards/reward_func/std": 0.4909684658050537, + "sampling/importance_sampling_ratio/max": 1.6518144607543945, + "sampling/importance_sampling_ratio/mean": 0.894943118095398, + "sampling/importance_sampling_ratio/min": 0.5825864672660828, + "sampling/sampling_logp_difference/max": 0.48093175888061523, + "sampling/sampling_logp_difference/mean": 0.02260264754295349, + "step": 23, + "step_time": 125.51062903201091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3474411070346832, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0562515258789062, + "kl": 0.001693481463007629, + "learning_rate": 3.833333333333334e-06, + "loss": 0.2041, + "num_tokens": 134846.0, + "reward": 0.08125000447034836, + "reward_std": 0.2956419289112091, + "rewards/reward_func/mean": 0.08125000447034836, + "rewards/reward_func/std": 0.3755924105644226, + "sampling/importance_sampling_ratio/max": 2.1531643867492676, + "sampling/importance_sampling_ratio/mean": 1.043798565864563, + "sampling/importance_sampling_ratio/min": 0.529705822467804, + "sampling/sampling_logp_difference/max": 0.34720849990844727, + "sampling/sampling_logp_difference/mean": 0.01930052787065506, + "step": 24, + "step_time": 167.04036519900546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.3871381878852844, + "epoch": 0.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3111368417739868, + "kl": 0.0014404752291738987, + "learning_rate": 4.000000000000001e-06, + "loss": -0.0471, + "num_tokens": 140378.0, + "reward": 0.36124998331069946, + "reward_std": 0.5499054193496704, + "rewards/reward_func/mean": 0.36124998331069946, + "rewards/reward_func/std": 0.5263469219207764, + "sampling/importance_sampling_ratio/max": 1.4156557321548462, + "sampling/importance_sampling_ratio/mean": 1.1120198965072632, + "sampling/importance_sampling_ratio/min": 0.7486764788627625, + "sampling/sampling_logp_difference/max": 0.48737621307373047, + "sampling/sampling_logp_difference/mean": 0.023105096071958542, + "step": 25, + "step_time": 120.95955076700193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 51.625, + "completions/mean_terminated_length": 51.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.39792579412460327, + "epoch": 0.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.396022915840149, + "kl": 0.0015975476708263159, + "learning_rate": 4.166666666666667e-06, + "loss": -0.0444, + "num_tokens": 146478.0, + "reward": 0.2224999964237213, + "reward_std": 0.31392672657966614, + "rewards/reward_func/mean": 0.2224999964237213, + "rewards/reward_func/std": 0.4807955026626587, + "sampling/importance_sampling_ratio/max": 1.508078932762146, + "sampling/importance_sampling_ratio/mean": 1.0499423742294312, + "sampling/importance_sampling_ratio/min": 0.5942177772521973, + "sampling/sampling_logp_difference/max": 0.3570747375488281, + "sampling/sampling_logp_difference/mean": 0.024486597627401352, + "step": 26, + "step_time": 104.28418873299961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3486085534095764, + "epoch": 0.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1279908418655396, + "kl": 0.0019664729479700327, + "learning_rate": 4.333333333333334e-06, + "loss": -0.1021, + "num_tokens": 151586.0, + "reward": 0.3425000011920929, + "reward_std": 0.2686923146247864, + "rewards/reward_func/mean": 0.3425000011920929, + "rewards/reward_func/std": 0.5363301634788513, + "sampling/importance_sampling_ratio/max": 1.8593271970748901, + "sampling/importance_sampling_ratio/mean": 1.1785297393798828, + "sampling/importance_sampling_ratio/min": 0.5566311478614807, + "sampling/sampling_logp_difference/max": 0.4686328172683716, + "sampling/sampling_logp_difference/mean": 0.022173412144184113, + "step": 27, + "step_time": 85.11922752400278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 51.375, + "completions/mean_terminated_length": 51.375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.37405920028686523, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8106129765510559, + "kl": 0.0015516983112320304, + "learning_rate": 4.5e-06, + "loss": 0.0532, + "num_tokens": 157108.0, + "reward": 0.32375001907348633, + "reward_std": 0.5761679410934448, + "rewards/reward_func/mean": 0.32375001907348633, + "rewards/reward_func/std": 0.5525767803192139, + "sampling/importance_sampling_ratio/max": 1.2767354249954224, + "sampling/importance_sampling_ratio/mean": 0.8917201161384583, + "sampling/importance_sampling_ratio/min": 0.5755601525306702, + "sampling/sampling_logp_difference/max": 0.4100228548049927, + "sampling/sampling_logp_difference/mean": 0.021200813353061676, + "step": 28, + "step_time": 109.97910062700976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3767409026622772, + "epoch": 0.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2156670093536377, + "kl": 0.0015971511602401733, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0163, + "num_tokens": 163323.0, + "reward": 0.08124999701976776, + "reward_std": 0.2750605642795563, + "rewards/reward_func/mean": 0.08124999701976776, + "rewards/reward_func/std": 0.35746878385543823, + "sampling/importance_sampling_ratio/max": 1.3450591564178467, + "sampling/importance_sampling_ratio/mean": 1.031332015991211, + "sampling/importance_sampling_ratio/min": 0.5739972591400146, + "sampling/sampling_logp_difference/max": 0.20969057083129883, + "sampling/sampling_logp_difference/mean": 0.018845085054636, + "step": 29, + "step_time": 125.90746463602409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3361830711364746, + "epoch": 0.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.085590362548828, + "kl": 0.002051064744591713, + "learning_rate": 4.833333333333333e-06, + "loss": -0.1044, + "num_tokens": 169007.0, + "reward": 0.11124999821186066, + "reward_std": 0.2608071267604828, + "rewards/reward_func/mean": 0.11124999821186066, + "rewards/reward_func/std": 0.3598586320877075, + "sampling/importance_sampling_ratio/max": 1.6862311363220215, + "sampling/importance_sampling_ratio/mean": 0.9613958597183228, + "sampling/importance_sampling_ratio/min": 0.4625941514968872, + "sampling/sampling_logp_difference/max": 0.6341955661773682, + "sampling/sampling_logp_difference/mean": 0.023160353302955627, + "step": 30, + "step_time": 116.55447733099572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.34975507855415344, + "epoch": 0.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7752349376678467, + "kl": 0.0018625394441187382, + "learning_rate": 5e-06, + "loss": -0.0863, + "num_tokens": 175255.0, + "reward": 0.05000000447034836, + "reward_std": 0.28021717071533203, + "rewards/reward_func/mean": 0.05000000447034836, + "rewards/reward_func/std": 0.3846333920955658, + "sampling/importance_sampling_ratio/max": 1.1800814867019653, + "sampling/importance_sampling_ratio/mean": 0.7340657711029053, + "sampling/importance_sampling_ratio/min": 0.3828251361846924, + "sampling/sampling_logp_difference/max": 0.574752688407898, + "sampling/sampling_logp_difference/mean": 0.022265031933784485, + "step": 31, + "step_time": 125.58116400899598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.33510199189186096, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3970941305160522, + "kl": 0.002852437552064657, + "learning_rate": 4.99998688809149e-06, + "loss": 0.0298, + "num_tokens": 180203.0, + "reward": 0.21125000715255737, + "reward_std": 0.5205552577972412, + "rewards/reward_func/mean": 0.21125000715255737, + "rewards/reward_func/std": 0.48203253746032715, + "sampling/importance_sampling_ratio/max": 1.4200493097305298, + "sampling/importance_sampling_ratio/mean": 0.8962746858596802, + "sampling/importance_sampling_ratio/min": 0.506354570388794, + "sampling/sampling_logp_difference/max": 0.3653395175933838, + "sampling/sampling_logp_difference/mean": 0.018153443932533264, + "step": 32, + "step_time": 99.66716593201272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 51.25, + "completions/mean_terminated_length": 51.25, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.359342485666275, + "epoch": 0.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8919492363929749, + "kl": 0.0016850470565259457, + "learning_rate": 4.9999475525034974e-06, + "loss": -0.0118, + "num_tokens": 185921.0, + "reward": 0.1887499988079071, + "reward_std": 0.5218685865402222, + "rewards/reward_func/mean": 0.1887499988079071, + "rewards/reward_func/std": 0.4834529757499695, + "sampling/importance_sampling_ratio/max": 1.193926453590393, + "sampling/importance_sampling_ratio/mean": 0.886371374130249, + "sampling/importance_sampling_ratio/min": 0.6291231513023376, + "sampling/sampling_logp_difference/max": 0.6103978157043457, + "sampling/sampling_logp_difference/mean": 0.022488413378596306, + "step": 33, + "step_time": 107.37048736499855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.3850080370903015, + "epoch": 0.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.457120180130005, + "kl": 0.002752592321485281, + "learning_rate": 4.999881993648633e-06, + "loss": -0.1605, + "num_tokens": 191340.0, + "reward": 0.33250001072883606, + "reward_std": 0.5519619584083557, + "rewards/reward_func/mean": 0.33250001072883606, + "rewards/reward_func/std": 0.5346227288246155, + "sampling/importance_sampling_ratio/max": 2.824227809906006, + "sampling/importance_sampling_ratio/mean": 1.2650679349899292, + "sampling/importance_sampling_ratio/min": 0.5782744288444519, + "sampling/sampling_logp_difference/max": 0.5304313898086548, + "sampling/sampling_logp_difference/mean": 0.026629671454429626, + "step": 34, + "step_time": 124.5775852559891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3793644905090332, + "epoch": 0.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1844000816345215, + "kl": 0.0018589177634567022, + "learning_rate": 4.99979021221458e-06, + "loss": 0.1138, + "num_tokens": 197242.0, + "reward": 0.20374999940395355, + "reward_std": 0.3106112480163574, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.48269888758659363, + "sampling/importance_sampling_ratio/max": 1.8714232444763184, + "sampling/importance_sampling_ratio/mean": 0.8821603059768677, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.7558255195617676, + "sampling/sampling_logp_difference/mean": 0.02827462926506996, + "step": 35, + "step_time": 124.59522388697951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3386607766151428, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0467987060546875, + "kl": 0.002486670855432749, + "learning_rate": 4.9996722091640805e-06, + "loss": -0.0844, + "num_tokens": 202103.0, + "reward": 0.7137500047683716, + "reward_std": 0.31673291325569153, + "rewards/reward_func/mean": 0.7137500047683716, + "rewards/reward_func/std": 0.4965578019618988, + "sampling/importance_sampling_ratio/max": 1.1834334135055542, + "sampling/importance_sampling_ratio/mean": 0.8062876462936401, + "sampling/importance_sampling_ratio/min": 0.3481108844280243, + "sampling/sampling_logp_difference/max": 0.5823209285736084, + "sampling/sampling_logp_difference/mean": 0.027472082525491714, + "step": 36, + "step_time": 95.7645532739989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 55.75, + "completions/mean_terminated_length": 55.75, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.35707682371139526, + "epoch": 0.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8499080538749695, + "kl": 0.00230376492254436, + "learning_rate": 4.999527985734932e-06, + "loss": 0.0658, + "num_tokens": 207849.0, + "reward": 0.3112500011920929, + "reward_std": 0.5869807004928589, + "rewards/reward_func/mean": 0.3112500011920929, + "rewards/reward_func/std": 0.5547313094139099, + "sampling/importance_sampling_ratio/max": 1.3937541246414185, + "sampling/importance_sampling_ratio/mean": 0.9204949140548706, + "sampling/importance_sampling_ratio/min": 0.5516513586044312, + "sampling/sampling_logp_difference/max": 0.340686559677124, + "sampling/sampling_logp_difference/mean": 0.02302435413002968, + "step": 37, + "step_time": 84.15662719498505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3510167896747589, + "epoch": 0.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4321602582931519, + "kl": 0.002203675452619791, + "learning_rate": 4.999357543439969e-06, + "loss": -0.251, + "num_tokens": 213602.0, + "reward": 0.3187499940395355, + "reward_std": 0.5740761756896973, + "rewards/reward_func/mean": 0.3187499940395355, + "rewards/reward_func/std": 0.5478904247283936, + "sampling/importance_sampling_ratio/max": 1.6841275691986084, + "sampling/importance_sampling_ratio/mean": 0.862945556640625, + "sampling/importance_sampling_ratio/min": 0.3341965675354004, + "sampling/sampling_logp_difference/max": 0.4191019535064697, + "sampling/sampling_logp_difference/mean": 0.023331163451075554, + "step": 38, + "step_time": 96.54629640298663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3528852164745331, + "epoch": 0.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0844610929489136, + "kl": 0.0035500035155564547, + "learning_rate": 4.999160884067051e-06, + "loss": 0.0473, + "num_tokens": 219224.0, + "reward": 0.08124999701976776, + "reward_std": 0.27454516291618347, + "rewards/reward_func/mean": 0.08124999701976776, + "rewards/reward_func/std": 0.3577883243560791, + "sampling/importance_sampling_ratio/max": 1.6339404582977295, + "sampling/importance_sampling_ratio/mean": 0.916239857673645, + "sampling/importance_sampling_ratio/min": 0.5048863291740417, + "sampling/sampling_logp_difference/max": 0.4355291724205017, + "sampling/sampling_logp_difference/mean": 0.02792040817439556, + "step": 39, + "step_time": 90.11714809801197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.35738155245780945, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9739111065864563, + "kl": 0.010244293138384819, + "learning_rate": 4.9989380096790416e-06, + "loss": 0.0651, + "num_tokens": 225224.0, + "reward": 0.057500001043081284, + "reward_std": 0.262703001499176, + "rewards/reward_func/mean": 0.057500001043081284, + "rewards/reward_func/std": 0.32779568433761597, + "sampling/importance_sampling_ratio/max": 0.9581937193870544, + "sampling/importance_sampling_ratio/mean": 0.7411354184150696, + "sampling/importance_sampling_ratio/min": 0.6077343821525574, + "sampling/sampling_logp_difference/max": 0.4662892818450928, + "sampling/sampling_logp_difference/mean": 0.027016079053282738, + "step": 40, + "step_time": 115.12962955801049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.358026921749115, + "epoch": 0.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9497724175453186, + "kl": 0.0010761432349681854, + "learning_rate": 4.998688922613788e-06, + "loss": -0.1294, + "num_tokens": 230706.0, + "reward": 0.09750000387430191, + "reward_std": 0.268343985080719, + "rewards/reward_func/mean": 0.09750000387430191, + "rewards/reward_func/std": 0.36311155557632446, + "sampling/importance_sampling_ratio/max": 1.3778512477874756, + "sampling/importance_sampling_ratio/mean": 0.8394644260406494, + "sampling/importance_sampling_ratio/min": 0.5254734754562378, + "sampling/sampling_logp_difference/max": 0.34768080711364746, + "sampling/sampling_logp_difference/mean": 0.02200084924697876, + "step": 41, + "step_time": 105.99199089498143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3547409176826477, + "epoch": 0.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8008268475532532, + "kl": 0.002836492843925953, + "learning_rate": 4.998413625484095e-06, + "loss": 0.0157, + "num_tokens": 235797.0, + "reward": 0.1837500035762787, + "reward_std": 0.49929410219192505, + "rewards/reward_func/mean": 0.1837500035762787, + "rewards/reward_func/std": 0.46315494179725647, + "sampling/importance_sampling_ratio/max": 1.5412955284118652, + "sampling/importance_sampling_ratio/mean": 0.9024899005889893, + "sampling/importance_sampling_ratio/min": 0.4405742287635803, + "sampling/sampling_logp_difference/max": 0.32985711097717285, + "sampling/sampling_logp_difference/mean": 0.022187065333127975, + "step": 42, + "step_time": 94.88906268199207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 51.25, + "completions/mean_terminated_length": 51.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.34511110186576843, + "epoch": 0.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.284348964691162, + "kl": 0.006113796960562468, + "learning_rate": 4.9981121211777e-06, + "loss": 0.2878, + "num_tokens": 242012.0, + "reward": 0.33125001192092896, + "reward_std": 0.27656540274620056, + "rewards/reward_func/mean": 0.33125001192092896, + "rewards/reward_func/std": 0.5243346095085144, + "sampling/importance_sampling_ratio/max": 2.3226945400238037, + "sampling/importance_sampling_ratio/mean": 0.8612687587738037, + "sampling/importance_sampling_ratio/min": 0.3401707410812378, + "sampling/sampling_logp_difference/max": 0.6737399101257324, + "sampling/sampling_logp_difference/mean": 0.02680300548672676, + "step": 43, + "step_time": 83.10613166898838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.35635292530059814, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8651462197303772, + "kl": 0.0018102331086993217, + "learning_rate": 4.997784412857239e-06, + "loss": 0.1933, + "num_tokens": 248290.0, + "reward": 0.08749999105930328, + "reward_std": 0.26158273220062256, + "rewards/reward_func/mean": 0.08749999105930328, + "rewards/reward_func/std": 0.3502958118915558, + "sampling/importance_sampling_ratio/max": 1.2505582571029663, + "sampling/importance_sampling_ratio/mean": 0.8515357971191406, + "sampling/importance_sampling_ratio/min": 0.3733709156513214, + "sampling/sampling_logp_difference/max": 0.3616971969604492, + "sampling/sampling_logp_difference/mean": 0.023276425898075104, + "step": 44, + "step_time": 96.6780000999861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 57.875, + "completions/mean_terminated_length": 57.875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.4258785843849182, + "epoch": 0.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7754176259040833, + "kl": 0.005309364292770624, + "learning_rate": 4.99743050396022e-06, + "loss": 0.0623, + "num_tokens": 253843.0, + "reward": 0.3537500202655792, + "reward_std": 0.5509142875671387, + "rewards/reward_func/mean": 0.3537500202655792, + "rewards/reward_func/std": 0.5298500657081604, + "sampling/importance_sampling_ratio/max": 1.3845423460006714, + "sampling/importance_sampling_ratio/mean": 0.8699455261230469, + "sampling/importance_sampling_ratio/min": 0.30967551469802856, + "sampling/sampling_logp_difference/max": 0.4125208854675293, + "sampling/sampling_logp_difference/mean": 0.02934259921312332, + "step": 45, + "step_time": 78.66523612500168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.37021562457084656, + "epoch": 0.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3564432859420776, + "kl": 0.002915932796895504, + "learning_rate": 4.997050398198977e-06, + "loss": 0.0626, + "num_tokens": 258896.0, + "reward": 0.33000001311302185, + "reward_std": 0.5612866878509521, + "rewards/reward_func/mean": 0.33000001311302185, + "rewards/reward_func/std": 0.5336666107177734, + "sampling/importance_sampling_ratio/max": 1.5622109174728394, + "sampling/importance_sampling_ratio/mean": 1.0067038536071777, + "sampling/importance_sampling_ratio/min": 0.41681596636772156, + "sampling/sampling_logp_difference/max": 0.5778782367706299, + "sampling/sampling_logp_difference/mean": 0.024849699810147285, + "step": 46, + "step_time": 91.13322191301268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.125, + "completions/mean_terminated_length": 48.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3771839439868927, + "epoch": 0.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5632238388061523, + "kl": 0.0020723105408251286, + "learning_rate": 4.9966440995606415e-06, + "loss": -0.192, + "num_tokens": 264298.0, + "reward": 0.20499999821186066, + "reward_std": 0.32124459743499756, + "rewards/reward_func/mean": 0.20499999821186066, + "rewards/reward_func/std": 0.4859159588813782, + "sampling/importance_sampling_ratio/max": 2.447948694229126, + "sampling/importance_sampling_ratio/mean": 1.2227914333343506, + "sampling/importance_sampling_ratio/min": 0.46755385398864746, + "sampling/sampling_logp_difference/max": 0.3898049592971802, + "sampling/sampling_logp_difference/mean": 0.023719076067209244, + "step": 47, + "step_time": 77.2298025219934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 41.375, + "completions/mean_terminated_length": 41.375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3675180673599243, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.202669620513916, + "kl": 0.003329810919240117, + "learning_rate": 4.9962116123070925e-06, + "loss": 0.1727, + "num_tokens": 269970.0, + "reward": 0.3412500023841858, + "reward_std": 0.5402647256851196, + "rewards/reward_func/mean": 0.3412500023841858, + "rewards/reward_func/std": 0.5210549235343933, + "sampling/importance_sampling_ratio/max": 2.6151790618896484, + "sampling/importance_sampling_ratio/mean": 0.9013060331344604, + "sampling/importance_sampling_ratio/min": 0.20561860501766205, + "sampling/sampling_logp_difference/max": 0.5823161602020264, + "sampling/sampling_logp_difference/mean": 0.03182598948478699, + "step": 48, + "step_time": 103.054377449007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3362388014793396, + "epoch": 0.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9811183214187622, + "kl": 0.001854179659858346, + "learning_rate": 4.9957529409749185e-06, + "loss": -0.0427, + "num_tokens": 275532.0, + "reward": 0.2212499976158142, + "reward_std": 0.5194467306137085, + "rewards/reward_func/mean": 0.2212499976158142, + "rewards/reward_func/std": 0.48111292719841003, + "sampling/importance_sampling_ratio/max": 1.1795125007629395, + "sampling/importance_sampling_ratio/mean": 0.8117722868919373, + "sampling/importance_sampling_ratio/min": 0.32398006319999695, + "sampling/sampling_logp_difference/max": 0.3128964900970459, + "sampling/sampling_logp_difference/mean": 0.020228173583745956, + "step": 49, + "step_time": 92.03725726599805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.33680054545402527, + "epoch": 0.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8777604699134827, + "kl": 0.0024415755178779364, + "learning_rate": 4.995268090375362e-06, + "loss": 0.05, + "num_tokens": 281466.0, + "reward": 0.0637500062584877, + "reward_std": 0.27558743953704834, + "rewards/reward_func/mean": 0.0637500062584877, + "rewards/reward_func/std": 0.3678484857082367, + "sampling/importance_sampling_ratio/max": 1.50518000125885, + "sampling/importance_sampling_ratio/mean": 0.9082848429679871, + "sampling/importance_sampling_ratio/min": 0.41154056787490845, + "sampling/sampling_logp_difference/max": 0.5747478008270264, + "sampling/sampling_logp_difference/mean": 0.024731453508138657, + "step": 50, + "step_time": 101.29344615599257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.41207462549209595, + "epoch": 0.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1961466073989868, + "kl": 0.0038733931723982096, + "learning_rate": 4.99475706559428e-06, + "loss": 0.0661, + "num_tokens": 286937.0, + "reward": 0.45500001311302185, + "reward_std": 0.5954470634460449, + "rewards/reward_func/mean": 0.45500001311302185, + "rewards/reward_func/std": 0.5518540143966675, + "sampling/importance_sampling_ratio/max": 1.4747618436813354, + "sampling/importance_sampling_ratio/mean": 0.934749960899353, + "sampling/importance_sampling_ratio/min": 0.4552203118801117, + "sampling/sampling_logp_difference/max": 0.3542771339416504, + "sampling/sampling_logp_difference/mean": 0.023817723616957664, + "step": 51, + "step_time": 98.44445793100749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.31511616706848145, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.32075035572052, + "kl": 0.0016216668300330639, + "learning_rate": 4.994219871992077e-06, + "loss": -0.0975, + "num_tokens": 292284.0, + "reward": 0.6025000214576721, + "reward_std": 0.2593764066696167, + "rewards/reward_func/mean": 0.6025000214576721, + "rewards/reward_func/std": 0.5323197245597839, + "sampling/importance_sampling_ratio/max": 2.0314249992370605, + "sampling/importance_sampling_ratio/mean": 1.274023413658142, + "sampling/importance_sampling_ratio/min": 0.5603557229042053, + "sampling/sampling_logp_difference/max": 0.7182197570800781, + "sampling/sampling_logp_difference/mean": 0.021842751652002335, + "step": 52, + "step_time": 74.32618405998801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 57.875, + "completions/mean_terminated_length": 57.875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.37222176790237427, + "epoch": 0.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8657234311103821, + "kl": 0.004503990523517132, + "learning_rate": 4.993656515203662e-06, + "loss": -0.0014, + "num_tokens": 298374.0, + "reward": 0.17249998450279236, + "reward_std": 0.3328244686126709, + "rewards/reward_func/mean": 0.17249998450279236, + "rewards/reward_func/std": 0.4848784804344177, + "sampling/importance_sampling_ratio/max": 1.6778643131256104, + "sampling/importance_sampling_ratio/mean": 0.9224530458450317, + "sampling/importance_sampling_ratio/min": 0.3226885199546814, + "sampling/sampling_logp_difference/max": 1.1129628419876099, + "sampling/sampling_logp_difference/mean": 0.023930778726935387, + "step": 53, + "step_time": 105.75805833100458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 51.625, + "completions/mean_terminated_length": 51.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.38617652654647827, + "epoch": 0.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9701952934265137, + "kl": 0.002225311938673258, + "learning_rate": 4.99306700113838e-06, + "loss": -0.0465, + "num_tokens": 303786.0, + "reward": 0.33000001311302185, + "reward_std": 0.5561413764953613, + "rewards/reward_func/mean": 0.33000001311302185, + "rewards/reward_func/std": 0.5339342355728149, + "sampling/importance_sampling_ratio/max": 1.0194271802902222, + "sampling/importance_sampling_ratio/mean": 0.7991744875907898, + "sampling/importance_sampling_ratio/min": 0.33269256353378296, + "sampling/sampling_logp_difference/max": 0.33765721321105957, + "sampling/sampling_logp_difference/mean": 0.026410941034555435, + "step": 54, + "step_time": 83.15403410801082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3604234457015991, + "epoch": 0.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8578059077262878, + "kl": 0.0011407495476305485, + "learning_rate": 4.9924513359799555e-06, + "loss": 0.0794, + "num_tokens": 309275.0, + "reward": 0.33250001072883606, + "reward_std": 0.5635805130004883, + "rewards/reward_func/mean": 0.33250001072883606, + "rewards/reward_func/std": 0.5450229048728943, + "sampling/importance_sampling_ratio/max": 1.3248019218444824, + "sampling/importance_sampling_ratio/mean": 0.8565744161605835, + "sampling/importance_sampling_ratio/min": 0.42319124937057495, + "sampling/sampling_logp_difference/max": 0.5236988067626953, + "sampling/sampling_logp_difference/mean": 0.022501163184642792, + "step": 55, + "step_time": 73.6418833520147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3908393681049347, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7073943614959717, + "kl": 0.003342224285006523, + "learning_rate": 4.991809526186424e-06, + "loss": -0.2814, + "num_tokens": 314307.0, + "reward": 0.32375001907348633, + "reward_std": 0.5598282814025879, + "rewards/reward_func/mean": 0.32375001907348633, + "rewards/reward_func/std": 0.5455518364906311, + "sampling/importance_sampling_ratio/max": 2.344586133956909, + "sampling/importance_sampling_ratio/mean": 1.196304440498352, + "sampling/importance_sampling_ratio/min": 0.5245997309684753, + "sampling/sampling_logp_difference/max": 1.1410305500030518, + "sampling/sampling_logp_difference/mean": 0.02790486253798008, + "step": 56, + "step_time": 90.67011975299101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 55.5, + "completions/mean_terminated_length": 55.5, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.34012845158576965, + "epoch": 0.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8670101761817932, + "kl": 0.0026079914532601833, + "learning_rate": 4.991141578490066e-06, + "loss": -0.1396, + "num_tokens": 320264.0, + "reward": 0.07624999433755875, + "reward_std": 0.29174044728279114, + "rewards/reward_func/mean": 0.07624999433755875, + "rewards/reward_func/std": 0.37625741958618164, + "sampling/importance_sampling_ratio/max": 1.4562015533447266, + "sampling/importance_sampling_ratio/mean": 0.7509514689445496, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.34556615352630615, + "sampling/sampling_logp_difference/mean": 0.02316705882549286, + "step": 57, + "step_time": 96.33522103502764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 53.125, + "completions/mean_terminated_length": 53.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3367074131965637, + "epoch": 0.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0496222972869873, + "kl": 0.0054735890589654446, + "learning_rate": 4.990447499897339e-06, + "loss": 0.1168, + "num_tokens": 325728.0, + "reward": 0.17374999821186066, + "reward_std": 0.538986086845398, + "rewards/reward_func/mean": 0.17374999821186066, + "rewards/reward_func/std": 0.49951228499412537, + "sampling/importance_sampling_ratio/max": 1.447581171989441, + "sampling/importance_sampling_ratio/mean": 0.9637683629989624, + "sampling/importance_sampling_ratio/min": 0.6208034157752991, + "sampling/sampling_logp_difference/max": 0.4196118116378784, + "sampling/sampling_logp_difference/mean": 0.023549657315015793, + "step": 58, + "step_time": 97.37826122099068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.875, + "completions/mean_terminated_length": 49.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.36870628595352173, + "epoch": 0.118, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3209524154663086, + "kl": 0.002333354204893112, + "learning_rate": 4.989727297688797e-06, + "loss": 0.1813, + "num_tokens": 331317.0, + "reward": 0.3449999988079071, + "reward_std": 0.5488909482955933, + "rewards/reward_func/mean": 0.3449999988079071, + "rewards/reward_func/std": 0.5325947403907776, + "sampling/importance_sampling_ratio/max": 1.9513617753982544, + "sampling/importance_sampling_ratio/mean": 0.996979296207428, + "sampling/importance_sampling_ratio/min": 0.6756687760353088, + "sampling/sampling_logp_difference/max": 0.4977457523345947, + "sampling/sampling_logp_difference/mean": 0.02654324471950531, + "step": 59, + "step_time": 92.74966769100865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 46.5, + "completions/mean_terminated_length": 46.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3398672938346863, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9856535792350769, + "kl": 0.0027454416267573833, + "learning_rate": 4.98898097941902e-06, + "loss": 0.0051, + "num_tokens": 336444.0, + "reward": -0.06875000149011612, + "reward_std": 0.06034637242555618, + "rewards/reward_func/mean": -0.06875000149011612, + "rewards/reward_func/std": 0.05667892098426819, + "sampling/importance_sampling_ratio/max": 1.05806303024292, + "sampling/importance_sampling_ratio/mean": 0.7077381014823914, + "sampling/importance_sampling_ratio/min": 0.29610589146614075, + "sampling/sampling_logp_difference/max": 0.538194477558136, + "sampling/sampling_logp_difference/mean": 0.026932962238788605, + "step": 60, + "step_time": 85.23989409799105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3458422124385834, + "epoch": 0.122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7404291033744812, + "kl": 0.0024542706087231636, + "learning_rate": 4.988208552916535e-06, + "loss": 0.0047, + "num_tokens": 341913.0, + "reward": 0.06750001013278961, + "reward_std": 0.2946765422821045, + "rewards/reward_func/mean": 0.06750001013278961, + "rewards/reward_func/std": 0.38231438398361206, + "sampling/importance_sampling_ratio/max": 1.2514207363128662, + "sampling/importance_sampling_ratio/mean": 0.8571313619613647, + "sampling/importance_sampling_ratio/min": 0.37189623713493347, + "sampling/sampling_logp_difference/max": 0.32369494438171387, + "sampling/sampling_logp_difference/mean": 0.01881476677954197, + "step": 61, + "step_time": 81.40806046701618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3777502477169037, + "epoch": 0.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5111892223358154, + "kl": 0.0038244668394327164, + "learning_rate": 4.98741002628373e-06, + "loss": -0.0188, + "num_tokens": 347236.0, + "reward": 0.6025000214576721, + "reward_std": 0.5539374351501465, + "rewards/reward_func/mean": 0.6025000214576721, + "rewards/reward_func/std": 0.5297641158103943, + "sampling/importance_sampling_ratio/max": 2.0949490070343018, + "sampling/importance_sampling_ratio/mean": 1.2000421285629272, + "sampling/importance_sampling_ratio/min": 0.6139946579933167, + "sampling/sampling_logp_difference/max": 0.6703405380249023, + "sampling/sampling_logp_difference/mean": 0.02322327345609665, + "step": 62, + "step_time": 69.72723935198155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3558458089828491, + "epoch": 0.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.667698085308075, + "kl": 0.0018400326371192932, + "learning_rate": 4.9865854078967715e-06, + "loss": 0.0829, + "num_tokens": 352685.0, + "reward": 0.5875000357627869, + "reward_std": 0.5716196298599243, + "rewards/reward_func/mean": 0.5875000357627869, + "rewards/reward_func/std": 0.5452063679695129, + "sampling/importance_sampling_ratio/max": 0.9193384647369385, + "sampling/importance_sampling_ratio/mean": 0.632682740688324, + "sampling/importance_sampling_ratio/min": 0.3342023193836212, + "sampling/sampling_logp_difference/max": 0.8547244071960449, + "sampling/sampling_logp_difference/mean": 0.022451236844062805, + "step": 63, + "step_time": 64.33397425999283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 46.625, + "completions/mean_terminated_length": 46.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.36075106263160706, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6670936346054077, + "kl": 0.0064406488090753555, + "learning_rate": 4.985734706405516e-06, + "loss": -0.1783, + "num_tokens": 358643.0, + "reward": 0.08750000596046448, + "reward_std": 0.2707710862159729, + "rewards/reward_func/mean": 0.08750000596046448, + "rewards/reward_func/std": 0.3586781620979309, + "sampling/importance_sampling_ratio/max": 2.078122615814209, + "sampling/importance_sampling_ratio/mean": 1.245069980621338, + "sampling/importance_sampling_ratio/min": 0.5846289992332458, + "sampling/sampling_logp_difference/max": 0.3692970275878906, + "sampling/sampling_logp_difference/mean": 0.022641174495220184, + "step": 64, + "step_time": 90.54826116497861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.34251487255096436, + "epoch": 0.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6975246071815491, + "kl": 0.0017276068683713675, + "learning_rate": 4.9848579307334195e-06, + "loss": 0.0456, + "num_tokens": 365099.0, + "reward": 0.2199999988079071, + "reward_std": 0.4802777171134949, + "rewards/reward_func/mean": 0.2199999988079071, + "rewards/reward_func/std": 0.44468289613723755, + "sampling/importance_sampling_ratio/max": 1.202215313911438, + "sampling/importance_sampling_ratio/mean": 0.8249953985214233, + "sampling/importance_sampling_ratio/min": 0.5363028645515442, + "sampling/sampling_logp_difference/max": 0.45500755310058594, + "sampling/sampling_logp_difference/mean": 0.01933646947145462, + "step": 65, + "step_time": 85.8507220740139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.35193923115730286, + "epoch": 0.132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0195058584213257, + "kl": 0.0025058817118406296, + "learning_rate": 4.983955090077445e-06, + "loss": -0.0506, + "num_tokens": 369968.0, + "reward": 0.20749999582767487, + "reward_std": 0.5109157562255859, + "rewards/reward_func/mean": 0.20749999582767487, + "rewards/reward_func/std": 0.4736107587814331, + "sampling/importance_sampling_ratio/max": 1.2251256704330444, + "sampling/importance_sampling_ratio/mean": 0.9472914934158325, + "sampling/importance_sampling_ratio/min": 0.8227061629295349, + "sampling/sampling_logp_difference/max": 0.31458473205566406, + "sampling/sampling_logp_difference/mean": 0.017272518947720528, + "step": 66, + "step_time": 76.18023397601792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.36691945791244507, + "epoch": 0.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0082647800445557, + "kl": 0.0018996293656527996, + "learning_rate": 4.983026193907962e-06, + "loss": 0.1843, + "num_tokens": 375164.0, + "reward": 0.21249999105930328, + "reward_std": 0.319685161113739, + "rewards/reward_func/mean": 0.21249999105930328, + "rewards/reward_func/std": 0.4872591197490692, + "sampling/importance_sampling_ratio/max": 1.7669250965118408, + "sampling/importance_sampling_ratio/mean": 0.8863449692726135, + "sampling/importance_sampling_ratio/min": 0.26863494515419006, + "sampling/sampling_logp_difference/max": 0.3789827823638916, + "sampling/sampling_logp_difference/mean": 0.02467949688434601, + "step": 67, + "step_time": 93.1446157169994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.35669732093811035, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8750259280204773, + "kl": 0.0033818050287663937, + "learning_rate": 4.982071251968653e-06, + "loss": 0.051, + "num_tokens": 380504.0, + "reward": 0.32875001430511475, + "reward_std": 0.5622336864471436, + "rewards/reward_func/mean": 0.32875001430511475, + "rewards/reward_func/std": 0.5360020399093628, + "sampling/importance_sampling_ratio/max": 1.2201801538467407, + "sampling/importance_sampling_ratio/mean": 0.8601148724555969, + "sampling/importance_sampling_ratio/min": 0.6346798539161682, + "sampling/sampling_logp_difference/max": 0.3918271064758301, + "sampling/sampling_logp_difference/mean": 0.023806363344192505, + "step": 68, + "step_time": 72.7238112029736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.4199118912220001, + "epoch": 0.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0077135562896729, + "kl": 0.00302119180560112, + "learning_rate": 4.981090274276406e-06, + "loss": 0.1156, + "num_tokens": 386315.0, + "reward": 0.059999994933605194, + "reward_std": 0.2796437740325928, + "rewards/reward_func/mean": 0.059999994933605194, + "rewards/reward_func/std": 0.36245197057724, + "sampling/importance_sampling_ratio/max": 1.8291817903518677, + "sampling/importance_sampling_ratio/mean": 0.9294091463088989, + "sampling/importance_sampling_ratio/min": 0.3144456446170807, + "sampling/sampling_logp_difference/max": 0.610379695892334, + "sampling/sampling_logp_difference/mean": 0.031378112733364105, + "step": 69, + "step_time": 88.02450225598295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 57.25, + "completions/mean_terminated_length": 57.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.38176238536834717, + "epoch": 0.14, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.892582356929779, + "kl": 0.0031344564631581306, + "learning_rate": 4.980083271121215e-06, + "loss": -0.1972, + "num_tokens": 391929.0, + "reward": 0.2150000035762787, + "reward_std": 0.50013267993927, + "rewards/reward_func/mean": 0.2150000035762787, + "rewards/reward_func/std": 0.46398892998695374, + "sampling/importance_sampling_ratio/max": 1.8030683994293213, + "sampling/importance_sampling_ratio/mean": 0.9491258859634399, + "sampling/importance_sampling_ratio/min": 0.3493870496749878, + "sampling/sampling_logp_difference/max": 0.4494798183441162, + "sampling/sampling_logp_difference/mean": 0.02591659128665924, + "step": 70, + "step_time": 71.29640350298723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.4286239743232727, + "epoch": 0.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2103837728500366, + "kl": 0.0030327460262924433, + "learning_rate": 4.979050253066064e-06, + "loss": -0.1033, + "num_tokens": 397951.0, + "reward": 0.1875, + "reward_std": 0.344761461019516, + "rewards/reward_func/mean": 0.1875, + "rewards/reward_func/std": 0.48029011487960815, + "sampling/importance_sampling_ratio/max": 1.681307315826416, + "sampling/importance_sampling_ratio/mean": 0.8077924847602844, + "sampling/importance_sampling_ratio/min": 0.35340648889541626, + "sampling/sampling_logp_difference/max": 0.5255258083343506, + "sampling/sampling_logp_difference/mean": 0.02561108022928238, + "step": 71, + "step_time": 94.12945175101049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 51.75, + "completions/mean_terminated_length": 51.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3729614019393921, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875525951385498, + "kl": 0.001732141012325883, + "learning_rate": 4.977991230946824e-06, + "loss": -0.0218, + "num_tokens": 402966.0, + "reward": 0.34375, + "reward_std": 0.5594232082366943, + "rewards/reward_func/mean": 0.34375, + "rewards/reward_func/std": 0.5332096219062805, + "sampling/importance_sampling_ratio/max": 1.3324414491653442, + "sampling/importance_sampling_ratio/mean": 0.9489821195602417, + "sampling/importance_sampling_ratio/min": 0.6200194358825684, + "sampling/sampling_logp_difference/max": 0.282620906829834, + "sampling/sampling_logp_difference/mean": 0.021304737776517868, + "step": 72, + "step_time": 68.7971295939933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 44.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.32240110635757446, + "epoch": 0.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.950322151184082, + "kl": 0.0028352453373372555, + "learning_rate": 4.976906215872137e-06, + "loss": 0.0467, + "num_tokens": 409055.0, + "reward": 0.3387500047683716, + "reward_std": 0.2871127426624298, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.5407254099845886, + "sampling/importance_sampling_ratio/max": 1.2006139755249023, + "sampling/importance_sampling_ratio/mean": 0.8248869180679321, + "sampling/importance_sampling_ratio/min": 0.4684114456176758, + "sampling/sampling_logp_difference/max": 0.43263185024261475, + "sampling/sampling_logp_difference/mean": 0.023945681750774384, + "step": 73, + "step_time": 102.22122251600376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3135406970977783, + "epoch": 0.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.154463529586792, + "kl": 0.03127220273017883, + "learning_rate": 4.975795219223299e-06, + "loss": -0.0935, + "num_tokens": 414402.0, + "reward": 0.3412500023841858, + "reward_std": 0.5579792261123657, + "rewards/reward_func/mean": 0.3412500023841858, + "rewards/reward_func/std": 0.535975456237793, + "sampling/importance_sampling_ratio/max": 2.067894220352173, + "sampling/importance_sampling_ratio/mean": 0.9438801407814026, + "sampling/importance_sampling_ratio/min": 0.4140065908432007, + "sampling/sampling_logp_difference/max": 0.4713999032974243, + "sampling/sampling_logp_difference/mean": 0.02523641288280487, + "step": 74, + "step_time": 89.00602476199856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 53.875, + "completions/mean_terminated_length": 53.875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.32595470547676086, + "epoch": 0.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8300859332084656, + "kl": 0.007171849254518747, + "learning_rate": 4.974658252654135e-06, + "loss": -0.0902, + "num_tokens": 419796.0, + "reward": 0.48250001668930054, + "reward_std": 0.5001860857009888, + "rewards/reward_func/mean": 0.48250001668930054, + "rewards/reward_func/std": 0.5347295999526978, + "sampling/importance_sampling_ratio/max": 1.4946962594985962, + "sampling/importance_sampling_ratio/mean": 0.8154863119125366, + "sampling/importance_sampling_ratio/min": 0.3788175582885742, + "sampling/sampling_logp_difference/max": 0.7174708843231201, + "sampling/sampling_logp_difference/mean": 0.021303167566657066, + "step": 75, + "step_time": 59.640716498019174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 49.375, + "completions/mean_terminated_length": 49.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.41575515270233154, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4510153532028198, + "kl": 0.011087974533438683, + "learning_rate": 4.973495328090891e-06, + "loss": 0.1287, + "num_tokens": 424726.0, + "reward": 0.3387500047683716, + "reward_std": 0.5444081425666809, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.5208355784416199, + "sampling/importance_sampling_ratio/max": 1.5896728038787842, + "sampling/importance_sampling_ratio/mean": 1.0859061479568481, + "sampling/importance_sampling_ratio/min": 0.718471348285675, + "sampling/sampling_logp_difference/max": 0.6861748695373535, + "sampling/sampling_logp_difference/mean": 0.02493377774953842, + "step": 76, + "step_time": 70.72388471697923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 42.875, + "completions/mean_terminated_length": 42.875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3451007008552551, + "epoch": 0.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7033120393753052, + "kl": 0.006896092556416988, + "learning_rate": 4.972306457732091e-06, + "loss": 0.1233, + "num_tokens": 429957.0, + "reward": 0.09125000238418579, + "reward_std": 0.2647804319858551, + "rewards/reward_func/mean": 0.09125000238418579, + "rewards/reward_func/std": 0.36841118335723877, + "sampling/importance_sampling_ratio/max": 1.4747828245162964, + "sampling/importance_sampling_ratio/mean": 1.0489879846572876, + "sampling/importance_sampling_ratio/min": 0.6281050443649292, + "sampling/sampling_logp_difference/max": 0.8355374336242676, + "sampling/sampling_logp_difference/mean": 0.02908758632838726, + "step": 77, + "step_time": 81.86546373798046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3363872170448303, + "epoch": 0.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0083401203155518, + "kl": 0.003795074066147208, + "learning_rate": 4.971091654048427e-06, + "loss": 0.3044, + "num_tokens": 436347.0, + "reward": 0.07499999552965164, + "reward_std": 0.2905214726924896, + "rewards/reward_func/mean": 0.07499999552965164, + "rewards/reward_func/std": 0.378644198179245, + "sampling/importance_sampling_ratio/max": 2.2779664993286133, + "sampling/importance_sampling_ratio/mean": 1.063035249710083, + "sampling/importance_sampling_ratio/min": 0.37523871660232544, + "sampling/sampling_logp_difference/max": 0.3972114324569702, + "sampling/sampling_logp_difference/mean": 0.02419961616396904, + "step": 78, + "step_time": 88.4848685679899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3581341505050659, + "epoch": 0.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.572177767753601, + "kl": 0.0030747244600206614, + "learning_rate": 4.96985092978261e-06, + "loss": -0.0336, + "num_tokens": 441325.0, + "reward": 0.051249999552965164, + "reward_std": 0.3115207850933075, + "rewards/reward_func/mean": 0.051249999552965164, + "rewards/reward_func/std": 0.3852434754371643, + "sampling/importance_sampling_ratio/max": 2.044938087463379, + "sampling/importance_sampling_ratio/mean": 0.971229076385498, + "sampling/importance_sampling_ratio/min": 0.1840338557958603, + "sampling/sampling_logp_difference/max": 0.6103886365890503, + "sampling/sampling_logp_difference/mean": 0.02984805777668953, + "step": 79, + "step_time": 92.6209842649987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.32628703117370605, + "epoch": 0.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8613694906234741, + "kl": 0.0033719956409186125, + "learning_rate": 4.968584297949255e-06, + "loss": -0.0019, + "num_tokens": 446935.0, + "reward": 0.4662500023841858, + "reward_std": 0.5965955257415771, + "rewards/reward_func/mean": 0.4662500023841858, + "rewards/reward_func/std": 0.5527060627937317, + "sampling/importance_sampling_ratio/max": 2.1245856285095215, + "sampling/importance_sampling_ratio/mean": 0.8101105690002441, + "sampling/importance_sampling_ratio/min": 0.354059100151062, + "sampling/sampling_logp_difference/max": 0.4604175090789795, + "sampling/sampling_logp_difference/mean": 0.024325117468833923, + "step": 80, + "step_time": 91.05656213400653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 58.125, + "completions/mean_terminated_length": 58.125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3670700788497925, + "epoch": 0.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2181848287582397, + "kl": 0.002382858656346798, + "learning_rate": 4.967291771834727e-06, + "loss": -0.2348, + "num_tokens": 452473.0, + "reward": 0.15125001966953278, + "reward_std": 0.3374948501586914, + "rewards/reward_func/mean": 0.15125001966953278, + "rewards/reward_func/std": 0.4997267425060272, + "sampling/importance_sampling_ratio/max": 2.508380174636841, + "sampling/importance_sampling_ratio/mean": 1.2941944599151611, + "sampling/importance_sampling_ratio/min": 0.646767258644104, + "sampling/sampling_logp_difference/max": 0.3313436508178711, + "sampling/sampling_logp_difference/mean": 0.02512197196483612, + "step": 81, + "step_time": 90.79619163498865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.2903425693511963, + "epoch": 0.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2693836688995361, + "kl": 0.0025903629139065742, + "learning_rate": 4.965973364997015e-06, + "loss": -0.0367, + "num_tokens": 458523.0, + "reward": 0.17125000059604645, + "reward_std": 0.3282886743545532, + "rewards/reward_func/mean": 0.17125000059604645, + "rewards/reward_func/std": 0.499898225069046, + "sampling/importance_sampling_ratio/max": 1.9814573526382446, + "sampling/importance_sampling_ratio/mean": 0.9206903576850891, + "sampling/importance_sampling_ratio/min": 0.2931478023529053, + "sampling/sampling_logp_difference/max": 0.4033019542694092, + "sampling/sampling_logp_difference/mean": 0.02239578776061535, + "step": 82, + "step_time": 101.29266464500688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3330921530723572, + "epoch": 0.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2236757278442383, + "kl": 0.003864692524075508, + "learning_rate": 4.964629091265583e-06, + "loss": -0.0728, + "num_tokens": 463684.0, + "reward": 0.4675000011920929, + "reward_std": 0.5979688763618469, + "rewards/reward_func/mean": 0.4675000011920929, + "rewards/reward_func/std": 0.5541208982467651, + "sampling/importance_sampling_ratio/max": 1.6764252185821533, + "sampling/importance_sampling_ratio/mean": 1.0374202728271484, + "sampling/importance_sampling_ratio/min": 0.6156142950057983, + "sampling/sampling_logp_difference/max": 0.5306464433670044, + "sampling/sampling_logp_difference/mean": 0.02434811368584633, + "step": 83, + "step_time": 65.75530088201049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 50.875, + "completions/mean_terminated_length": 50.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3707743287086487, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0121077299118042, + "kl": 0.006942209787666798, + "learning_rate": 4.963258964741227e-06, + "loss": 0.1128, + "num_tokens": 468918.0, + "reward": 0.3462499976158142, + "reward_std": 0.5688021183013916, + "rewards/reward_func/mean": 0.3462499976158142, + "rewards/reward_func/std": 0.5434529185295105, + "sampling/importance_sampling_ratio/max": 1.8691112995147705, + "sampling/importance_sampling_ratio/mean": 0.9797255992889404, + "sampling/importance_sampling_ratio/min": 0.19412098824977875, + "sampling/sampling_logp_difference/max": 0.6592090129852295, + "sampling/sampling_logp_difference/mean": 0.027863148599863052, + "step": 84, + "step_time": 69.84349327700329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.350742369890213, + "epoch": 0.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.236136794090271, + "kl": 0.0026309723034501076, + "learning_rate": 4.961862999795923e-06, + "loss": 0.0105, + "num_tokens": 474878.0, + "reward": 0.061250001192092896, + "reward_std": 0.2900194525718689, + "rewards/reward_func/mean": 0.061250001192092896, + "rewards/reward_func/std": 0.3823774456977844, + "sampling/importance_sampling_ratio/max": 2.504836320877075, + "sampling/importance_sampling_ratio/mean": 1.2779099941253662, + "sampling/importance_sampling_ratio/min": 0.65166175365448, + "sampling/sampling_logp_difference/max": 0.5060451030731201, + "sampling/sampling_logp_difference/mean": 0.021245911717414856, + "step": 85, + "step_time": 77.75256623400492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.625, + "completions/mean_terminated_length": 51.625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.36720341444015503, + "epoch": 0.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3172496557235718, + "kl": 0.0030011916533112526, + "learning_rate": 4.960441211072686e-06, + "loss": -0.1479, + "num_tokens": 480065.0, + "reward": 0.4399999976158142, + "reward_std": 0.5658103227615356, + "rewards/reward_func/mean": 0.4399999976158142, + "rewards/reward_func/std": 0.5238865613937378, + "sampling/importance_sampling_ratio/max": 2.6345438957214355, + "sampling/importance_sampling_ratio/mean": 1.2061142921447754, + "sampling/importance_sampling_ratio/min": 0.6962835192680359, + "sampling/sampling_logp_difference/max": 0.3562922477722168, + "sampling/sampling_logp_difference/mean": 0.023759279400110245, + "step": 86, + "step_time": 79.76976580297924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3295614421367645, + "epoch": 0.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2377467155456543, + "kl": 0.0017705978825688362, + "learning_rate": 4.958993613485406e-06, + "loss": 0.1347, + "num_tokens": 485174.0, + "reward": -0.05000000074505806, + "reward_std": 0.03639974445104599, + "rewards/reward_func/mean": -0.05000000074505806, + "rewards/reward_func/std": 0.041403934359550476, + "sampling/importance_sampling_ratio/max": 2.701768398284912, + "sampling/importance_sampling_ratio/mean": 1.1716480255126953, + "sampling/importance_sampling_ratio/min": 0.710328221321106, + "sampling/sampling_logp_difference/max": 0.3339419364929199, + "sampling/sampling_logp_difference/mean": 0.02282622456550598, + "step": 87, + "step_time": 98.58460397701128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3728080689907074, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9409402012825012, + "kl": 0.004020972643047571, + "learning_rate": 4.957520222218695e-06, + "loss": -0.2078, + "num_tokens": 491186.0, + "reward": 0.20624999701976776, + "reward_std": 0.3007173538208008, + "rewards/reward_func/mean": 0.20624999701976776, + "rewards/reward_func/std": 0.4542478024959564, + "sampling/importance_sampling_ratio/max": 1.1913342475891113, + "sampling/importance_sampling_ratio/mean": 0.8587566018104553, + "sampling/importance_sampling_ratio/min": 0.5435622930526733, + "sampling/sampling_logp_difference/max": 0.33767926692962646, + "sampling/sampling_logp_difference/mean": 0.02355227991938591, + "step": 88, + "step_time": 83.78908705202048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.41878965497016907, + "epoch": 0.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9201162457466125, + "kl": 0.0033999462611973286, + "learning_rate": 4.956021052727731e-06, + "loss": -0.0817, + "num_tokens": 497013.0, + "reward": 0.09125000238418579, + "reward_std": 0.2839151620864868, + "rewards/reward_func/mean": 0.09125000238418579, + "rewards/reward_func/std": 0.369727224111557, + "sampling/importance_sampling_ratio/max": 1.4009984731674194, + "sampling/importance_sampling_ratio/mean": 0.8987118005752563, + "sampling/importance_sampling_ratio/min": 0.636340320110321, + "sampling/sampling_logp_difference/max": 0.3343019485473633, + "sampling/sampling_logp_difference/mean": 0.02563662827014923, + "step": 89, + "step_time": 76.07593618502142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 50.375, + "completions/mean_terminated_length": 50.375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3703194260597229, + "epoch": 0.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0612517595291138, + "kl": 0.00556425005197525, + "learning_rate": 4.954496120738094e-06, + "loss": 0.1675, + "num_tokens": 502431.0, + "reward": 0.3462499976158142, + "reward_std": 0.5637357234954834, + "rewards/reward_func/mean": 0.3462499976158142, + "rewards/reward_func/std": 0.5389921069145203, + "sampling/importance_sampling_ratio/max": 1.7256437540054321, + "sampling/importance_sampling_ratio/mean": 0.9468981027603149, + "sampling/importance_sampling_ratio/min": 0.606105387210846, + "sampling/sampling_logp_difference/max": 0.343442440032959, + "sampling/sampling_logp_difference/mean": 0.022628474980592728, + "step": 90, + "step_time": 75.86889905299176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3772880434989929, + "epoch": 0.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3590164184570312, + "kl": 0.0036642742343246937, + "learning_rate": 4.952945442245598e-06, + "loss": -0.2427, + "num_tokens": 508352.0, + "reward": 0.07499998807907104, + "reward_std": 0.28367576003074646, + "rewards/reward_func/mean": 0.07499998807907104, + "rewards/reward_func/std": 0.3612280488014221, + "sampling/importance_sampling_ratio/max": 1.6069527864456177, + "sampling/importance_sampling_ratio/mean": 0.8528153300285339, + "sampling/importance_sampling_ratio/min": 0.2771243453025818, + "sampling/sampling_logp_difference/max": 0.6460120677947998, + "sampling/sampling_logp_difference/mean": 0.026305314153432846, + "step": 91, + "step_time": 101.26391361499554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3880101442337036, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.07570219039917, + "kl": 0.0035898014903068542, + "learning_rate": 4.951369033516127e-06, + "loss": -0.0628, + "num_tokens": 513922.0, + "reward": 0.3474999964237213, + "reward_std": 0.538013756275177, + "rewards/reward_func/mean": 0.3474999964237213, + "rewards/reward_func/std": 0.5158834457397461, + "sampling/importance_sampling_ratio/max": 2.46356201171875, + "sampling/importance_sampling_ratio/mean": 1.4318658113479614, + "sampling/importance_sampling_ratio/min": 0.7919402122497559, + "sampling/sampling_logp_difference/max": 0.5595130920410156, + "sampling/sampling_logp_difference/mean": 0.021545136347413063, + "step": 92, + "step_time": 89.50697983897408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3597927391529083, + "epoch": 0.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7946093082427979, + "kl": 0.00549793615937233, + "learning_rate": 4.949766911085461e-06, + "loss": -0.0008, + "num_tokens": 519677.0, + "reward": 0.2150000035762787, + "reward_std": 0.5236697196960449, + "rewards/reward_func/mean": 0.2150000035762787, + "rewards/reward_func/std": 0.48529812693595886, + "sampling/importance_sampling_ratio/max": 1.5985785722732544, + "sampling/importance_sampling_ratio/mean": 0.9824653267860413, + "sampling/importance_sampling_ratio/min": 0.5499786734580994, + "sampling/sampling_logp_difference/max": 0.40012407302856445, + "sampling/sampling_logp_difference/mean": 0.020420320332050323, + "step": 93, + "step_time": 75.89820895600133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 54.75, + "completions/mean_terminated_length": 54.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.37839484214782715, + "epoch": 0.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6853567957878113, + "kl": 0.003214706666767597, + "learning_rate": 4.948139091759108e-06, + "loss": 0.2257, + "num_tokens": 525806.0, + "reward": 0.19624999165534973, + "reward_std": 0.5386360883712769, + "rewards/reward_func/mean": 0.19624999165534973, + "rewards/reward_func/std": 0.4986822009086609, + "sampling/importance_sampling_ratio/max": 2.103787422180176, + "sampling/importance_sampling_ratio/mean": 1.0454094409942627, + "sampling/importance_sampling_ratio/min": 0.4870206117630005, + "sampling/sampling_logp_difference/max": 0.3358621597290039, + "sampling/sampling_logp_difference/mean": 0.01956326514482498, + "step": 94, + "step_time": 85.72357106002164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3306207060813904, + "epoch": 0.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1758936643600464, + "kl": 0.002643125131726265, + "learning_rate": 4.946485592612122e-06, + "loss": -0.1909, + "num_tokens": 531520.0, + "reward": 0.06624999642372131, + "reward_std": 0.3067837357521057, + "rewards/reward_func/mean": 0.06624999642372131, + "rewards/reward_func/std": 0.38149845600128174, + "sampling/importance_sampling_ratio/max": 1.9292004108428955, + "sampling/importance_sampling_ratio/mean": 1.0738458633422852, + "sampling/importance_sampling_ratio/min": 0.6487919688224792, + "sampling/sampling_logp_difference/max": 0.3406977653503418, + "sampling/sampling_logp_difference/mean": 0.0220349058508873, + "step": 95, + "step_time": 77.85138512399863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.312557190656662, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1386572122573853, + "kl": 0.0068060653284192085, + "learning_rate": 4.944806430988927e-06, + "loss": -0.2, + "num_tokens": 536890.0, + "reward": 0.33250001072883606, + "reward_std": 0.5673606991767883, + "rewards/reward_func/mean": 0.33250001072883606, + "rewards/reward_func/std": 0.5434480309486389, + "sampling/importance_sampling_ratio/max": 1.4778003692626953, + "sampling/importance_sampling_ratio/mean": 0.9165278673171997, + "sampling/importance_sampling_ratio/min": 0.373668909072876, + "sampling/sampling_logp_difference/max": 0.5655592679977417, + "sampling/sampling_logp_difference/mean": 0.024791110306978226, + "step": 96, + "step_time": 62.66469675899134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3480789363384247, + "epoch": 0.194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8446367979049683, + "kl": 0.009365051984786987, + "learning_rate": 4.943101624503133e-06, + "loss": 0.1505, + "num_tokens": 542424.0, + "reward": -0.07750000059604645, + "reward_std": 0.06912855058908463, + "rewards/reward_func/mean": -0.07750000059604645, + "rewards/reward_func/std": 0.06453128159046173, + "sampling/importance_sampling_ratio/max": 1.3271279335021973, + "sampling/importance_sampling_ratio/mean": 0.8955328464508057, + "sampling/importance_sampling_ratio/min": 0.2967626452445984, + "sampling/sampling_logp_difference/max": 0.4783933162689209, + "sampling/sampling_logp_difference/mean": 0.025432758033275604, + "step": 97, + "step_time": 121.15107800901751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3962988257408142, + "epoch": 0.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7532799243927002, + "kl": 0.0025938255712389946, + "learning_rate": 4.941371191037353e-06, + "loss": 0.5137, + "num_tokens": 548175.0, + "reward": 0.09125000238418579, + "reward_std": 0.27026599645614624, + "rewards/reward_func/mean": 0.09125000238418579, + "rewards/reward_func/std": 0.35750874876976013, + "sampling/importance_sampling_ratio/max": 2.1303369998931885, + "sampling/importance_sampling_ratio/mean": 1.1401922702789307, + "sampling/importance_sampling_ratio/min": 0.46015465259552, + "sampling/sampling_logp_difference/max": 0.5732070207595825, + "sampling/sampling_logp_difference/mean": 0.028814753517508507, + "step": 98, + "step_time": 93.41499740499421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 46.875, + "completions/mean_terminated_length": 46.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.36940494179725647, + "epoch": 0.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0608575344085693, + "kl": 0.003493869910016656, + "learning_rate": 4.939615148743017e-06, + "loss": -0.1552, + "num_tokens": 553527.0, + "reward": 0.20625001192092896, + "reward_std": 0.5164605379104614, + "rewards/reward_func/mean": 0.20625001192092896, + "rewards/reward_func/std": 0.4783584475517273, + "sampling/importance_sampling_ratio/max": 1.2723692655563354, + "sampling/importance_sampling_ratio/mean": 0.8784043788909912, + "sampling/importance_sampling_ratio/min": 0.5372451543807983, + "sampling/sampling_logp_difference/max": 0.5341734886169434, + "sampling/sampling_logp_difference/mean": 0.02111111767590046, + "step": 99, + "step_time": 90.65784694100148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3440898358821869, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1309008598327637, + "kl": 0.0054893046617507935, + "learning_rate": 4.937833516040177e-06, + "loss": 0.0197, + "num_tokens": 559881.0, + "reward": 0.29875001311302185, + "reward_std": 0.5750788450241089, + "rewards/reward_func/mean": 0.29875001311302185, + "rewards/reward_func/std": 0.5545767545700073, + "sampling/importance_sampling_ratio/max": 1.1451483964920044, + "sampling/importance_sampling_ratio/mean": 0.8518111705780029, + "sampling/importance_sampling_ratio/min": 0.6277573108673096, + "sampling/sampling_logp_difference/max": 0.48022013902664185, + "sampling/sampling_logp_difference/mean": 0.024066496640443802, + "step": 100, + "step_time": 103.06095111800823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3469806909561157, + "epoch": 0.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7262893319129944, + "kl": 0.0019808816723525524, + "learning_rate": 4.936026311617316e-06, + "loss": 0.0341, + "num_tokens": 565252.0, + "reward": 0.040000006556510925, + "reward_std": 0.3062291443347931, + "rewards/reward_func/mean": 0.040000006556510925, + "rewards/reward_func/std": 0.3904210329055786, + "sampling/importance_sampling_ratio/max": 2.1688106060028076, + "sampling/importance_sampling_ratio/mean": 1.0679666996002197, + "sampling/importance_sampling_ratio/min": 0.3155788481235504, + "sampling/sampling_logp_difference/max": 0.7815747261047363, + "sampling/sampling_logp_difference/mean": 0.02425907365977764, + "step": 101, + "step_time": 103.25376764600514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.34851688146591187, + "epoch": 0.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0761581659317017, + "kl": 0.006346363108605146, + "learning_rate": 4.9341935544311536e-06, + "loss": -0.0727, + "num_tokens": 570076.0, + "reward": 0.4737500250339508, + "reward_std": 0.5936700105667114, + "rewards/reward_func/mean": 0.4737500250339508, + "rewards/reward_func/std": 0.5496476292610168, + "sampling/importance_sampling_ratio/max": 1.647140622138977, + "sampling/importance_sampling_ratio/mean": 1.0042234659194946, + "sampling/importance_sampling_ratio/min": 0.49350976943969727, + "sampling/sampling_logp_difference/max": 0.609084963798523, + "sampling/sampling_logp_difference/mean": 0.025170542299747467, + "step": 102, + "step_time": 64.7651237519749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.3427722454071045, + "epoch": 0.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7976966500282288, + "kl": 0.0015114627312868834, + "learning_rate": 4.932335263706446e-06, + "loss": -0.1135, + "num_tokens": 576043.0, + "reward": 0.5950000286102295, + "reward_std": 0.5730479955673218, + "rewards/reward_func/mean": 0.5950000286102295, + "rewards/reward_func/std": 0.5514914989471436, + "sampling/importance_sampling_ratio/max": 1.248305082321167, + "sampling/importance_sampling_ratio/mean": 0.944599986076355, + "sampling/importance_sampling_ratio/min": 0.5915149450302124, + "sampling/sampling_logp_difference/max": 0.2609410285949707, + "sampling/sampling_logp_difference/mean": 0.019696425646543503, + "step": 103, + "step_time": 71.29194252597517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3026958703994751, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.035447359085083, + "kl": 0.002654898911714554, + "learning_rate": 4.930451458935783e-06, + "loss": -0.0804, + "num_tokens": 580966.0, + "reward": 0.4725000262260437, + "reward_std": 0.5602477788925171, + "rewards/reward_func/mean": 0.4725000262260437, + "rewards/reward_func/std": 0.5191407799720764, + "sampling/importance_sampling_ratio/max": 1.4709969758987427, + "sampling/importance_sampling_ratio/mean": 0.9203107357025146, + "sampling/importance_sampling_ratio/min": 0.38464194536209106, + "sampling/sampling_logp_difference/max": 0.44011521339416504, + "sampling/sampling_logp_difference/mean": 0.019846128299832344, + "step": 104, + "step_time": 63.931227530993056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3297388553619385, + "epoch": 0.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1884028911590576, + "kl": 0.0026048035360872746, + "learning_rate": 4.928542159879386e-06, + "loss": 0.0376, + "num_tokens": 586118.0, + "reward": 0.46875, + "reward_std": 0.5946630239486694, + "rewards/reward_func/mean": 0.46875, + "rewards/reward_func/std": 0.550803005695343, + "sampling/importance_sampling_ratio/max": 1.9977771043777466, + "sampling/importance_sampling_ratio/mean": 0.9485726952552795, + "sampling/importance_sampling_ratio/min": 0.21913643181324005, + "sampling/sampling_logp_difference/max": 0.32509803771972656, + "sampling/sampling_logp_difference/mean": 0.021720722317695618, + "step": 105, + "step_time": 83.04879019001964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 44.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.35155361890792847, + "epoch": 0.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0504732131958008, + "kl": 0.004135879687964916, + "learning_rate": 4.926607386564898e-06, + "loss": 0.1553, + "num_tokens": 591400.0, + "reward": -0.054999999701976776, + "reward_std": 0.04559952765703201, + "rewards/reward_func/mean": -0.054999999701976776, + "rewards/reward_func/std": 0.05903993919491768, + "sampling/importance_sampling_ratio/max": 1.2951682806015015, + "sampling/importance_sampling_ratio/mean": 0.6690744757652283, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.4716770648956299, + "sampling/sampling_logp_difference/mean": 0.02485671453177929, + "step": 106, + "step_time": 82.7186574760126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.34445077180862427, + "epoch": 0.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5816599726676941, + "kl": 0.002885550959035754, + "learning_rate": 4.924647159287176e-06, + "loss": 0.0717, + "num_tokens": 596902.0, + "reward": 0.3525000214576721, + "reward_std": 0.2688867449760437, + "rewards/reward_func/mean": 0.3525000214576721, + "rewards/reward_func/std": 0.5281977653503418, + "sampling/importance_sampling_ratio/max": 1.3920848369598389, + "sampling/importance_sampling_ratio/mean": 0.6316713094711304, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.5735256671905518, + "sampling/sampling_logp_difference/mean": 0.024873752146959305, + "step": 107, + "step_time": 67.83759238000493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.34955301880836487, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4328964948654175, + "kl": 0.001958028180524707, + "learning_rate": 4.922661498608077e-06, + "loss": 0.0188, + "num_tokens": 602106.0, + "reward": 0.4675000309944153, + "reward_std": 0.5285453796386719, + "rewards/reward_func/mean": 0.4675000309944153, + "rewards/reward_func/std": 0.5641112327575684, + "sampling/importance_sampling_ratio/max": 1.7434935569763184, + "sampling/importance_sampling_ratio/mean": 1.167940616607666, + "sampling/importance_sampling_ratio/min": 0.30112484097480774, + "sampling/sampling_logp_difference/max": 0.42384326457977295, + "sampling/sampling_logp_difference/mean": 0.023688288405537605, + "step": 108, + "step_time": 72.5184516950103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 50.875, + "completions/mean_terminated_length": 50.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3498673439025879, + "epoch": 0.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5927255153656006, + "kl": 0.0023621944710612297, + "learning_rate": 4.920650425356239e-06, + "loss": -0.2113, + "num_tokens": 607347.0, + "reward": -0.0625, + "reward_std": 0.05268768593668938, + "rewards/reward_func/mean": -0.0625, + "rewards/reward_func/std": 0.04978525638580322, + "sampling/importance_sampling_ratio/max": 1.686551809310913, + "sampling/importance_sampling_ratio/mean": 1.2102283239364624, + "sampling/importance_sampling_ratio/min": 0.6830826997756958, + "sampling/sampling_logp_difference/max": 0.3530259132385254, + "sampling/sampling_logp_difference/mean": 0.02098490670323372, + "step": 109, + "step_time": 86.84248229800141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 51.75, + "completions/mean_terminated_length": 51.75, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.35680192708969116, + "epoch": 0.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8017518520355225, + "kl": 0.0017795683816075325, + "learning_rate": 4.9186139606268735e-06, + "loss": -0.0156, + "num_tokens": 612704.0, + "reward": 0.07250000536441803, + "reward_std": 0.2962879240512848, + "rewards/reward_func/mean": 0.07250000536441803, + "rewards/reward_func/std": 0.3792756497859955, + "sampling/importance_sampling_ratio/max": 1.3156285285949707, + "sampling/importance_sampling_ratio/mean": 0.9082809686660767, + "sampling/importance_sampling_ratio/min": 0.624051570892334, + "sampling/sampling_logp_difference/max": 0.40149879455566406, + "sampling/sampling_logp_difference/mean": 0.024703415110707283, + "step": 110, + "step_time": 93.84956424098345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.32336217164993286, + "epoch": 0.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3528348207473755, + "kl": 0.002949443645775318, + "learning_rate": 4.916552125781529e-06, + "loss": -0.0099, + "num_tokens": 618399.0, + "reward": 0.45625001192092896, + "reward_std": 0.6083469390869141, + "rewards/reward_func/mean": 0.45625001192092896, + "rewards/reward_func/std": 0.5633810758590698, + "sampling/importance_sampling_ratio/max": 1.8164138793945312, + "sampling/importance_sampling_ratio/mean": 1.0095850229263306, + "sampling/importance_sampling_ratio/min": 0.5931808352470398, + "sampling/sampling_logp_difference/max": 0.5142123699188232, + "sampling/sampling_logp_difference/mean": 0.02279416099190712, + "step": 111, + "step_time": 79.4126727580151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.30478546023368835, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1281598806381226, + "kl": 0.0069794668816030025, + "learning_rate": 4.9144649424478765e-06, + "loss": -0.0498, + "num_tokens": 623892.0, + "reward": 0.08249999582767487, + "reward_std": 0.2825864851474762, + "rewards/reward_func/mean": 0.08249999582767487, + "rewards/reward_func/std": 0.37247246503829956, + "sampling/importance_sampling_ratio/max": 1.6144704818725586, + "sampling/importance_sampling_ratio/mean": 0.8535523414611816, + "sampling/importance_sampling_ratio/min": 0.48705849051475525, + "sampling/sampling_logp_difference/max": 0.6082849502563477, + "sampling/sampling_logp_difference/mean": 0.02124343067407608, + "step": 112, + "step_time": 95.65409678898868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3653002679347992, + "epoch": 0.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7723777294158936, + "kl": 0.008705868385732174, + "learning_rate": 4.912352432519484e-06, + "loss": -0.0089, + "num_tokens": 629286.0, + "reward": 0.0650000050663948, + "reward_std": 0.2788448631763458, + "rewards/reward_func/mean": 0.0650000050663948, + "rewards/reward_func/std": 0.36924636363983154, + "sampling/importance_sampling_ratio/max": 1.065728783607483, + "sampling/importance_sampling_ratio/mean": 0.8029188513755798, + "sampling/importance_sampling_ratio/min": 0.6283921003341675, + "sampling/sampling_logp_difference/max": 0.4079105854034424, + "sampling/sampling_logp_difference/mean": 0.02258678898215294, + "step": 113, + "step_time": 79.44450050298474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 50.625, + "completions/mean_terminated_length": 50.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3991454541683197, + "epoch": 0.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2725372314453125, + "kl": 0.004002253524959087, + "learning_rate": 4.910214618155579e-06, + "loss": -0.3522, + "num_tokens": 635091.0, + "reward": 0.3412500023841858, + "reward_std": 0.5658435821533203, + "rewards/reward_func/mean": 0.3412500023841858, + "rewards/reward_func/std": 0.5457481741905212, + "sampling/importance_sampling_ratio/max": 1.7948518991470337, + "sampling/importance_sampling_ratio/mean": 1.1463571786880493, + "sampling/importance_sampling_ratio/min": 0.5749549865722656, + "sampling/sampling_logp_difference/max": 0.36228108406066895, + "sampling/sampling_logp_difference/mean": 0.024780135601758957, + "step": 114, + "step_time": 75.48941349799861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3554052710533142, + "epoch": 0.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4149909019470215, + "kl": 0.00297270598821342, + "learning_rate": 4.908051521780824e-06, + "loss": -0.0461, + "num_tokens": 641015.0, + "reward": 0.21000000834465027, + "reward_std": 0.5283524990081787, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.4891683757305145, + "sampling/importance_sampling_ratio/max": 1.816857933998108, + "sampling/importance_sampling_ratio/mean": 1.0504989624023438, + "sampling/importance_sampling_ratio/min": 0.6830813884735107, + "sampling/sampling_logp_difference/max": 0.25256574153900146, + "sampling/sampling_logp_difference/mean": 0.017991136759519577, + "step": 115, + "step_time": 79.2266922009876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3142127990722656, + "epoch": 0.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1826856136322021, + "kl": 0.005817875266075134, + "learning_rate": 4.905863166085076e-06, + "loss": -0.3008, + "num_tokens": 646381.0, + "reward": 0.33250001072883606, + "reward_std": 0.5690850019454956, + "rewards/reward_func/mean": 0.33250001072883606, + "rewards/reward_func/std": 0.5483677387237549, + "sampling/importance_sampling_ratio/max": 1.5298662185668945, + "sampling/importance_sampling_ratio/mean": 0.9026192426681519, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.40772104263305664, + "sampling/sampling_logp_difference/mean": 0.021547261625528336, + "step": 116, + "step_time": 83.28613287501503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.37694644927978516, + "epoch": 0.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9839865565299988, + "kl": 0.002937185112386942, + "learning_rate": 4.903649574023151e-06, + "loss": -0.0315, + "num_tokens": 652897.0, + "reward": 0.20125000178813934, + "reward_std": 0.31244680285453796, + "rewards/reward_func/mean": 0.20125000178813934, + "rewards/reward_func/std": 0.4629852771759033, + "sampling/importance_sampling_ratio/max": 1.3970638513565063, + "sampling/importance_sampling_ratio/mean": 0.8803737163543701, + "sampling/importance_sampling_ratio/min": 0.568600058555603, + "sampling/sampling_logp_difference/max": 0.3313124179840088, + "sampling/sampling_logp_difference/mean": 0.02077941596508026, + "step": 117, + "step_time": 92.2992887319997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 50.875, + "completions/mean_terminated_length": 50.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3316619396209717, + "epoch": 0.236, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2543243169784546, + "kl": 0.0029075820930302143, + "learning_rate": 4.901410768814581e-06, + "loss": 0.3369, + "num_tokens": 659068.0, + "reward": 0.06875000149011612, + "reward_std": 0.28412488102912903, + "rewards/reward_func/mean": 0.06875000149011612, + "rewards/reward_func/std": 0.36701256036758423, + "sampling/importance_sampling_ratio/max": 1.9490493535995483, + "sampling/importance_sampling_ratio/mean": 1.230026125907898, + "sampling/importance_sampling_ratio/min": 0.5826879739761353, + "sampling/sampling_logp_difference/max": 0.3663163185119629, + "sampling/sampling_logp_difference/mean": 0.02376718446612358, + "step": 118, + "step_time": 81.04622099900735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3243013620376587, + "epoch": 0.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0776394605636597, + "kl": 0.003003204707056284, + "learning_rate": 4.899146773943374e-06, + "loss": 0.0888, + "num_tokens": 664052.0, + "reward": 0.45374998450279236, + "reward_std": 0.5213634967803955, + "rewards/reward_func/mean": 0.45374998450279236, + "rewards/reward_func/std": 0.555361807346344, + "sampling/importance_sampling_ratio/max": 1.163706660270691, + "sampling/importance_sampling_ratio/mean": 0.9491457939147949, + "sampling/importance_sampling_ratio/min": 0.7341592311859131, + "sampling/sampling_logp_difference/max": 0.3309454917907715, + "sampling/sampling_logp_difference/mean": 0.021823348477482796, + "step": 119, + "step_time": 41.72294841398252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.36999672651290894, + "epoch": 0.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.41775381565094, + "kl": 0.002642344683408737, + "learning_rate": 4.896857613157765e-06, + "loss": -0.1831, + "num_tokens": 669769.0, + "reward": 0.06624999642372131, + "reward_std": 0.2767047882080078, + "rewards/reward_func/mean": 0.06624999642372131, + "rewards/reward_func/std": 0.37159648537635803, + "sampling/importance_sampling_ratio/max": 1.3718173503875732, + "sampling/importance_sampling_ratio/mean": 0.9549704194068909, + "sampling/importance_sampling_ratio/min": 0.5532563924789429, + "sampling/sampling_logp_difference/max": 0.5324568748474121, + "sampling/sampling_logp_difference/mean": 0.02652132511138916, + "step": 120, + "step_time": 92.35115976299858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 52.0, + "completions/mean_terminated_length": 52.0, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.35806554555892944, + "epoch": 0.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6040545105934143, + "kl": 0.0024742307141423225, + "learning_rate": 4.894543310469968e-06, + "loss": -0.0793, + "num_tokens": 675165.0, + "reward": 0.06000000610947609, + "reward_std": 0.2803305685520172, + "rewards/reward_func/mean": 0.06000000610947609, + "rewards/reward_func/std": 0.3688979744911194, + "sampling/importance_sampling_ratio/max": 2.129168748855591, + "sampling/importance_sampling_ratio/mean": 0.9699528813362122, + "sampling/importance_sampling_ratio/min": 0.44781070947647095, + "sampling/sampling_logp_difference/max": 0.2890472412109375, + "sampling/sampling_logp_difference/mean": 0.022462455555796623, + "step": 121, + "step_time": 78.31796040100744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3762264847755432, + "epoch": 0.244, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0539501905441284, + "kl": 0.005846098996698856, + "learning_rate": 4.8922038901559225e-06, + "loss": 0.0026, + "num_tokens": 681091.0, + "reward": 0.2162500023841858, + "reward_std": 0.5134057998657227, + "rewards/reward_func/mean": 0.2162500023841858, + "rewards/reward_func/std": 0.47575318813323975, + "sampling/importance_sampling_ratio/max": 1.4815739393234253, + "sampling/importance_sampling_ratio/mean": 0.9299391508102417, + "sampling/importance_sampling_ratio/min": 0.5258536338806152, + "sampling/sampling_logp_difference/max": 0.30264854431152344, + "sampling/sampling_logp_difference/mean": 0.020659077912569046, + "step": 122, + "step_time": 82.64369376702234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 46.375, + "completions/mean_terminated_length": 46.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.35361769795417786, + "epoch": 0.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1598920822143555, + "kl": 0.003386750351637602, + "learning_rate": 4.889839376755041e-06, + "loss": 0.0283, + "num_tokens": 687305.0, + "reward": 0.3087500035762787, + "reward_std": 0.5727449655532837, + "rewards/reward_func/mean": 0.3087500035762787, + "rewards/reward_func/std": 0.5458528995513916, + "sampling/importance_sampling_ratio/max": 1.7175296545028687, + "sampling/importance_sampling_ratio/mean": 1.1679165363311768, + "sampling/importance_sampling_ratio/min": 0.6514824628829956, + "sampling/sampling_logp_difference/max": 0.47839367389678955, + "sampling/sampling_logp_difference/mean": 0.023937463760375977, + "step": 123, + "step_time": 90.77700008201646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 44.25, + "completions/mean_terminated_length": 44.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.35109275579452515, + "epoch": 0.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0874840021133423, + "kl": 0.0036450112238526344, + "learning_rate": 4.887449795069948e-06, + "loss": 0.1575, + "num_tokens": 693333.0, + "reward": 0.4699999988079071, + "reward_std": 0.02523816004395485, + "rewards/reward_func/mean": 0.4699999988079071, + "rewards/reward_func/std": 0.5484002232551575, + "sampling/importance_sampling_ratio/max": 2.278353214263916, + "sampling/importance_sampling_ratio/mean": 1.207700490951538, + "sampling/importance_sampling_ratio/min": 0.5298588871955872, + "sampling/sampling_logp_difference/max": 0.27920615673065186, + "sampling/sampling_logp_difference/mean": 0.023498259484767914, + "step": 124, + "step_time": 39.61502477500471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.35430189967155457, + "epoch": 0.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6847572326660156, + "kl": 0.004919369705021381, + "learning_rate": 4.885035170166229e-06, + "loss": -0.2698, + "num_tokens": 698906.0, + "reward": 0.0949999988079071, + "reward_std": 0.2718702256679535, + "rewards/reward_func/mean": 0.0949999988079071, + "rewards/reward_func/std": 0.36629417538642883, + "sampling/importance_sampling_ratio/max": 1.845292329788208, + "sampling/importance_sampling_ratio/mean": 1.1654714345932007, + "sampling/importance_sampling_ratio/min": 0.6519782543182373, + "sampling/sampling_logp_difference/max": 0.4780464172363281, + "sampling/sampling_logp_difference/mean": 0.0236376766115427, + "step": 125, + "step_time": 84.81572797399713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.37540292739868164, + "epoch": 0.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1251049041748047, + "kl": 0.004228860605508089, + "learning_rate": 4.8825955273721524e-06, + "loss": -0.1821, + "num_tokens": 704535.0, + "reward": 0.19999998807907104, + "reward_std": 0.5351865291595459, + "rewards/reward_func/mean": 0.19999998807907104, + "rewards/reward_func/std": 0.49549400806427, + "sampling/importance_sampling_ratio/max": 1.2624843120574951, + "sampling/importance_sampling_ratio/mean": 0.850989818572998, + "sampling/importance_sampling_ratio/min": 0.30390864610671997, + "sampling/sampling_logp_difference/max": 0.3254203796386719, + "sampling/sampling_logp_difference/mean": 0.022064577788114548, + "step": 126, + "step_time": 76.2562780379958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.41910192370414734, + "epoch": 0.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.317619800567627, + "kl": 0.005742704961448908, + "learning_rate": 4.88013089227842e-06, + "loss": 0.0236, + "num_tokens": 709703.0, + "reward": 0.4424999952316284, + "reward_std": 0.5581981539726257, + "rewards/reward_func/mean": 0.4424999952316284, + "rewards/reward_func/std": 0.5799199342727661, + "sampling/importance_sampling_ratio/max": 1.348827838897705, + "sampling/importance_sampling_ratio/mean": 0.9275149703025818, + "sampling/importance_sampling_ratio/min": 0.33642151951789856, + "sampling/sampling_logp_difference/max": 0.3352065086364746, + "sampling/sampling_logp_difference/mean": 0.027083944529294968, + "step": 127, + "step_time": 63.868688657006714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.3978528380393982, + "epoch": 0.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7018318176269531, + "kl": 0.0047828396782279015, + "learning_rate": 4.8776412907378845e-06, + "loss": -0.1779, + "num_tokens": 715735.0, + "reward": 0.21000000834465027, + "reward_std": 0.5070174336433411, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.46940696239471436, + "sampling/importance_sampling_ratio/max": 1.5801129341125488, + "sampling/importance_sampling_ratio/mean": 0.823201060295105, + "sampling/importance_sampling_ratio/min": 0.35874059796333313, + "sampling/sampling_logp_difference/max": 0.6062784194946289, + "sampling/sampling_logp_difference/mean": 0.02410537376999855, + "step": 128, + "step_time": 83.61712833400816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 55.5, + "completions/mean_terminated_length": 55.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.36482003331184387, + "epoch": 0.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.880913496017456, + "kl": 0.005393403582274914, + "learning_rate": 4.87512674886529e-06, + "loss": 0.1742, + "num_tokens": 720958.0, + "reward": 0.20374999940395355, + "reward_std": 0.311365008354187, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.46601465344429016, + "sampling/importance_sampling_ratio/max": 2.4826807975769043, + "sampling/importance_sampling_ratio/mean": 1.167348861694336, + "sampling/importance_sampling_ratio/min": 0.26117852330207825, + "sampling/sampling_logp_difference/max": 0.8425577878952026, + "sampling/sampling_logp_difference/mean": 0.025742068886756897, + "step": 129, + "step_time": 64.07934993301751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.371246337890625, + "epoch": 0.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.614325761795044, + "kl": 0.0035083587281405926, + "learning_rate": 4.872587293036991e-06, + "loss": -0.2087, + "num_tokens": 727108.0, + "reward": 0.29375001788139343, + "reward_std": 0.562328040599823, + "rewards/reward_func/mean": 0.29375001788139343, + "rewards/reward_func/std": 0.537532389163971, + "sampling/importance_sampling_ratio/max": 1.6865801811218262, + "sampling/importance_sampling_ratio/mean": 1.0800793170928955, + "sampling/importance_sampling_ratio/min": 0.5898501873016357, + "sampling/sampling_logp_difference/max": 0.3827958106994629, + "sampling/sampling_logp_difference/mean": 0.024071460589766502, + "step": 130, + "step_time": 78.63163189700572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.32340848445892334, + "epoch": 0.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.304259181022644, + "kl": 0.004556507803499699, + "learning_rate": 4.870022949890676e-06, + "loss": 0.207, + "num_tokens": 733001.0, + "reward": 0.19374999403953552, + "reward_std": 0.535616397857666, + "rewards/reward_func/mean": 0.19374999403953552, + "rewards/reward_func/std": 0.495895653963089, + "sampling/importance_sampling_ratio/max": 2.3520002365112305, + "sampling/importance_sampling_ratio/mean": 1.1608736515045166, + "sampling/importance_sampling_ratio/min": 0.5538614988327026, + "sampling/sampling_logp_difference/max": 0.394594669342041, + "sampling/sampling_logp_difference/mean": 0.0221773199737072, + "step": 131, + "step_time": 92.43735384300817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 46.125, + "completions/mean_terminated_length": 46.125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3576907515525818, + "epoch": 0.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0428054332733154, + "kl": 0.018777361139655113, + "learning_rate": 4.867433746325093e-06, + "loss": -0.0642, + "num_tokens": 739105.0, + "reward": 0.21875, + "reward_std": 0.5183683633804321, + "rewards/reward_func/mean": 0.21875, + "rewards/reward_func/std": 0.4804295003414154, + "sampling/importance_sampling_ratio/max": 1.5391448736190796, + "sampling/importance_sampling_ratio/mean": 0.9020660519599915, + "sampling/importance_sampling_ratio/min": 0.2766675651073456, + "sampling/sampling_logp_difference/max": 1.2109692096710205, + "sampling/sampling_logp_difference/mean": 0.033887773752212524, + "step": 132, + "step_time": 87.67151060700417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.33072763681411743, + "epoch": 0.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5945179462432861, + "kl": 0.0026382263749837875, + "learning_rate": 4.864819709499762e-06, + "loss": -0.0036, + "num_tokens": 744440.0, + "reward": 0.5874999761581421, + "reward_std": 0.5714925527572632, + "rewards/reward_func/mean": 0.5874999761581421, + "rewards/reward_func/std": 0.5562823414802551, + "sampling/importance_sampling_ratio/max": 2.234254837036133, + "sampling/importance_sampling_ratio/mean": 1.3247432708740234, + "sampling/importance_sampling_ratio/min": 0.6224689483642578, + "sampling/sampling_logp_difference/max": 0.3226501941680908, + "sampling/sampling_logp_difference/mean": 0.021056218072772026, + "step": 133, + "step_time": 55.756049841002095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3642032742500305, + "epoch": 0.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390766859054565, + "kl": 0.0025580194778740406, + "learning_rate": 4.862180866834691e-06, + "loss": -0.2011, + "num_tokens": 750197.0, + "reward": 0.3425000011920929, + "reward_std": 0.5499535799026489, + "rewards/reward_func/mean": 0.3425000011920929, + "rewards/reward_func/std": 0.5348631739616394, + "sampling/importance_sampling_ratio/max": 2.091257333755493, + "sampling/importance_sampling_ratio/mean": 0.9505029916763306, + "sampling/importance_sampling_ratio/min": 0.39569008350372314, + "sampling/sampling_logp_difference/max": 0.5128500461578369, + "sampling/sampling_logp_difference/mean": 0.022797472774982452, + "step": 134, + "step_time": 77.64818813299644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.35776978731155396, + "epoch": 0.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.332118034362793, + "kl": 0.0026144185103476048, + "learning_rate": 4.8595172460100914e-06, + "loss": -0.1461, + "num_tokens": 755246.0, + "reward": 0.20875000953674316, + "reward_std": 0.3141333758831024, + "rewards/reward_func/mean": 0.20875000953674316, + "rewards/reward_func/std": 0.4824472665786743, + "sampling/importance_sampling_ratio/max": 2.2898991107940674, + "sampling/importance_sampling_ratio/mean": 1.1793955564498901, + "sampling/importance_sampling_ratio/min": 0.3208160698413849, + "sampling/sampling_logp_difference/max": 0.32736682891845703, + "sampling/sampling_logp_difference/mean": 0.025175008922815323, + "step": 135, + "step_time": 94.36543551200884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.326175332069397, + "epoch": 0.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.042453408241272, + "kl": 0.002404378727078438, + "learning_rate": 4.856828874966086e-06, + "loss": 0.0913, + "num_tokens": 761403.0, + "reward": 0.08624999970197678, + "reward_std": 0.24735195934772491, + "rewards/reward_func/mean": 0.08624999970197678, + "rewards/reward_func/std": 0.33019205927848816, + "sampling/importance_sampling_ratio/max": 1.598332405090332, + "sampling/importance_sampling_ratio/mean": 1.1521015167236328, + "sampling/importance_sampling_ratio/min": 0.8144843578338623, + "sampling/sampling_logp_difference/max": 0.30525922775268555, + "sampling/sampling_logp_difference/mean": 0.02139485627412796, + "step": 136, + "step_time": 89.18075294501614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 43.125, + "completions/mean_terminated_length": 43.125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.32590705156326294, + "epoch": 0.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6651734709739685, + "kl": 0.00382244773209095, + "learning_rate": 4.854115781902414e-06, + "loss": 0.0387, + "num_tokens": 767277.0, + "reward": 0.3462499976158142, + "reward_std": 0.5454127788543701, + "rewards/reward_func/mean": 0.3462499976158142, + "rewards/reward_func/std": 0.5253825187683105, + "sampling/importance_sampling_ratio/max": 1.1137996912002563, + "sampling/importance_sampling_ratio/mean": 0.6871470808982849, + "sampling/importance_sampling_ratio/min": 0.2649921774864197, + "sampling/sampling_logp_difference/max": 0.43839168548583984, + "sampling/sampling_logp_difference/mean": 0.024401474744081497, + "step": 137, + "step_time": 63.591420451994054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3442898988723755, + "epoch": 0.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3569279909133911, + "kl": 0.006373442709445953, + "learning_rate": 4.851377995278138e-06, + "loss": 0.0322, + "num_tokens": 772862.0, + "reward": 0.34375, + "reward_std": 0.28818657994270325, + "rewards/reward_func/mean": 0.34375, + "rewards/reward_func/std": 0.5449754595756531, + "sampling/importance_sampling_ratio/max": 2.1303188800811768, + "sampling/importance_sampling_ratio/mean": 1.062652587890625, + "sampling/importance_sampling_ratio/min": 0.4433048367500305, + "sampling/sampling_logp_difference/max": 0.5312175750732422, + "sampling/sampling_logp_difference/mean": 0.027195535600185394, + "step": 138, + "step_time": 79.88628011097899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.33581745624542236, + "epoch": 0.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8458130955696106, + "kl": 0.003910760395228863, + "learning_rate": 4.8486155438113455e-06, + "loss": 0.1521, + "num_tokens": 778552.0, + "reward": 0.07750000059604645, + "reward_std": 0.27364206314086914, + "rewards/reward_func/mean": 0.07750000059604645, + "rewards/reward_func/std": 0.3691205680370331, + "sampling/importance_sampling_ratio/max": 1.5847712755203247, + "sampling/importance_sampling_ratio/mean": 0.9813762307167053, + "sampling/importance_sampling_ratio/min": 0.5472065806388855, + "sampling/sampling_logp_difference/max": 0.3161931037902832, + "sampling/sampling_logp_difference/mean": 0.017320292070508003, + "step": 139, + "step_time": 76.26634333998663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 45.875, + "completions/mean_terminated_length": 45.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3359699845314026, + "epoch": 0.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.083722710609436, + "kl": 0.06697973608970642, + "learning_rate": 4.845828456478843e-06, + "loss": 0.2268, + "num_tokens": 784051.0, + "reward": 0.2162499874830246, + "reward_std": 0.510195791721344, + "rewards/reward_func/mean": 0.2162499874830246, + "rewards/reward_func/std": 0.47307315468788147, + "sampling/importance_sampling_ratio/max": 1.3066641092300415, + "sampling/importance_sampling_ratio/mean": 0.8830969929695129, + "sampling/importance_sampling_ratio/min": 0.5595781207084656, + "sampling/sampling_logp_difference/max": 0.38854122161865234, + "sampling/sampling_logp_difference/mean": 0.026553025469183922, + "step": 140, + "step_time": 69.88314274000004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.31609684228897095, + "epoch": 0.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.061277389526367, + "kl": 0.013994975946843624, + "learning_rate": 4.84301676251586e-06, + "loss": 0.3487, + "num_tokens": 788975.0, + "reward": -0.07000000029802322, + "reward_std": 0.052655644714832306, + "rewards/reward_func/mean": -0.07000000029802322, + "rewards/reward_func/std": 0.051823876798152924, + "sampling/importance_sampling_ratio/max": 2.0056557655334473, + "sampling/importance_sampling_ratio/mean": 0.9070459008216858, + "sampling/importance_sampling_ratio/min": 0.40230387449264526, + "sampling/sampling_logp_difference/max": 0.49358463287353516, + "sampling/sampling_logp_difference/mean": 0.023013930767774582, + "step": 141, + "step_time": 86.36220617999788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.39303815364837646, + "epoch": 0.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3883771896362305, + "kl": 0.0059036314487457275, + "learning_rate": 4.840180491415733e-06, + "loss": 0.0691, + "num_tokens": 794046.0, + "reward": 0.1899999976158142, + "reward_std": 0.3205098509788513, + "rewards/reward_func/mean": 0.1899999976158142, + "rewards/reward_func/std": 0.48966461420059204, + "sampling/importance_sampling_ratio/max": 1.5588135719299316, + "sampling/importance_sampling_ratio/mean": 1.177175521850586, + "sampling/importance_sampling_ratio/min": 0.6510878801345825, + "sampling/sampling_logp_difference/max": 0.8279815912246704, + "sampling/sampling_logp_difference/mean": 0.02602643519639969, + "step": 142, + "step_time": 74.00148760300362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3808709979057312, + "epoch": 0.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9528871774673462, + "kl": 0.0036585263442248106, + "learning_rate": 4.837319672929606e-06, + "loss": 0.0533, + "num_tokens": 800535.0, + "reward": 0.023750003427267075, + "reward_std": 0.3143449127674103, + "rewards/reward_func/mean": 0.023750003427267075, + "rewards/reward_func/std": 0.3961578905582428, + "sampling/importance_sampling_ratio/max": 1.8512533903121948, + "sampling/importance_sampling_ratio/mean": 0.934387743473053, + "sampling/importance_sampling_ratio/min": 0.49656394124031067, + "sampling/sampling_logp_difference/max": 0.3407723903656006, + "sampling/sampling_logp_difference/mean": 0.02522529661655426, + "step": 143, + "step_time": 103.3366472539783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.33955660462379456, + "epoch": 0.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3187702894210815, + "kl": 0.003852253081277013, + "learning_rate": 4.834434337066112e-06, + "loss": -0.0678, + "num_tokens": 806790.0, + "reward": 0.4737499952316284, + "reward_std": 0.6020057201385498, + "rewards/reward_func/mean": 0.4737499952316284, + "rewards/reward_func/std": 0.5573647022247314, + "sampling/importance_sampling_ratio/max": 1.97614586353302, + "sampling/importance_sampling_ratio/mean": 1.0368638038635254, + "sampling/importance_sampling_ratio/min": 0.6063994765281677, + "sampling/sampling_logp_difference/max": 0.3311631679534912, + "sampling/sampling_logp_difference/mean": 0.02065013162791729, + "step": 144, + "step_time": 65.69823711100616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.331376314163208, + "epoch": 0.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9537304043769836, + "kl": 0.007181019987910986, + "learning_rate": 4.831524514091056e-06, + "loss": -0.0085, + "num_tokens": 812242.0, + "reward": 0.17624999582767487, + "reward_std": 0.32842689752578735, + "rewards/reward_func/mean": 0.17624999582767487, + "rewards/reward_func/std": 0.47101524472236633, + "sampling/importance_sampling_ratio/max": 1.308451533317566, + "sampling/importance_sampling_ratio/mean": 0.9442053437232971, + "sampling/importance_sampling_ratio/min": 0.579285740852356, + "sampling/sampling_logp_difference/max": 0.34829843044281006, + "sampling/sampling_logp_difference/mean": 0.017970219254493713, + "step": 145, + "step_time": 79.94236772001022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.36558613181114197, + "epoch": 0.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.937701404094696, + "kl": 0.006811534054577351, + "learning_rate": 4.828590234527107e-06, + "loss": -0.0287, + "num_tokens": 817572.0, + "reward": 0.4725000262260437, + "reward_std": 0.48875167965888977, + "rewards/reward_func/mean": 0.4725000262260437, + "rewards/reward_func/std": 0.5325075387954712, + "sampling/importance_sampling_ratio/max": 1.5232332944869995, + "sampling/importance_sampling_ratio/mean": 0.8906862735748291, + "sampling/importance_sampling_ratio/min": 0.5748233199119568, + "sampling/sampling_logp_difference/max": 0.3986041247844696, + "sampling/sampling_logp_difference/mean": 0.022225454449653625, + "step": 146, + "step_time": 47.61015773200779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 48.125, + "completions/mean_terminated_length": 48.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.34926676750183105, + "epoch": 0.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4631636142730713, + "kl": 0.005530308000743389, + "learning_rate": 4.825631529153466e-06, + "loss": 0.0385, + "num_tokens": 823067.0, + "reward": -0.05374999716877937, + "reward_std": 0.0510866716504097, + "rewards/reward_func/mean": -0.05374999716877937, + "rewards/reward_func/std": 0.05655276030302048, + "sampling/importance_sampling_ratio/max": 1.7795029878616333, + "sampling/importance_sampling_ratio/mean": 1.1954052448272705, + "sampling/importance_sampling_ratio/min": 0.6682614088058472, + "sampling/sampling_logp_difference/max": 0.4499216079711914, + "sampling/sampling_logp_difference/mean": 0.021715868264436722, + "step": 147, + "step_time": 97.10051084400038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.34877437353134155, + "epoch": 0.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.521934986114502, + "kl": 0.005301266442984343, + "learning_rate": 4.8226484290055544e-06, + "loss": 0.0887, + "num_tokens": 828804.0, + "reward": 0.48374998569488525, + "reward_std": 0.5961781144142151, + "rewards/reward_func/mean": 0.48374998569488525, + "rewards/reward_func/std": 0.5521112680435181, + "sampling/importance_sampling_ratio/max": 1.8339399099349976, + "sampling/importance_sampling_ratio/mean": 1.0161359310150146, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.7577025890350342, + "sampling/sampling_logp_difference/mean": 0.025348538532853127, + "step": 148, + "step_time": 61.746048774017254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3230116367340088, + "epoch": 0.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9116004705429077, + "kl": 0.011923927813768387, + "learning_rate": 4.8196409653746815e-06, + "loss": 0.0086, + "num_tokens": 834368.0, + "reward": 0.05874999612569809, + "reward_std": 0.28938817977905273, + "rewards/reward_func/mean": 0.05874999612569809, + "rewards/reward_func/std": 0.38327306509017944, + "sampling/importance_sampling_ratio/max": 1.5544939041137695, + "sampling/importance_sampling_ratio/mean": 0.9016103148460388, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.2695889472961426, + "sampling/sampling_logp_difference/mean": 0.02005579136312008, + "step": 149, + "step_time": 93.92776288898312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 46.875, + "completions/mean_terminated_length": 46.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.31266871094703674, + "epoch": 0.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0015181303024292, + "kl": 0.003962170798331499, + "learning_rate": 4.8166091698077165e-06, + "loss": -0.0324, + "num_tokens": 839246.0, + "reward": 0.22374999523162842, + "reward_std": 0.5151989459991455, + "rewards/reward_func/mean": 0.22374999523162842, + "rewards/reward_func/std": 0.47782060503959656, + "sampling/importance_sampling_ratio/max": 1.158785343170166, + "sampling/importance_sampling_ratio/mean": 0.86640864610672, + "sampling/importance_sampling_ratio/min": 0.6415550112724304, + "sampling/sampling_logp_difference/max": 0.31314682960510254, + "sampling/sampling_logp_difference/mean": 0.018176782876253128, + "step": 150, + "step_time": 90.2421096219914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 43.125, + "completions/mean_terminated_length": 43.125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.28932589292526245, + "epoch": 0.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.364370584487915, + "kl": 0.00638082530349493, + "learning_rate": 4.813553074106761e-06, + "loss": -0.166, + "num_tokens": 844201.0, + "reward": 0.3137499988079071, + "reward_std": 0.5894033312797546, + "rewards/reward_func/mean": 0.3137499988079071, + "rewards/reward_func/std": 0.5638119578361511, + "sampling/importance_sampling_ratio/max": 2.0728578567504883, + "sampling/importance_sampling_ratio/mean": 1.2075482606887817, + "sampling/importance_sampling_ratio/min": 0.5291450023651123, + "sampling/sampling_logp_difference/max": 0.6335185766220093, + "sampling/sampling_logp_difference/mean": 0.022180885076522827, + "step": 151, + "step_time": 77.12891793900053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.39411360025405884, + "epoch": 0.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.070342779159546, + "kl": 0.006427218206226826, + "learning_rate": 4.8104727103288125e-06, + "loss": 0.123, + "num_tokens": 850002.0, + "reward": -0.03125, + "reward_std": 0.026997683569788933, + "rewards/reward_func/mean": -0.03125, + "rewards/reward_func/std": 0.025319388136267662, + "sampling/importance_sampling_ratio/max": 1.8757935762405396, + "sampling/importance_sampling_ratio/mean": 1.1552492380142212, + "sampling/importance_sampling_ratio/min": 0.589501678943634, + "sampling/sampling_logp_difference/max": 0.4456930160522461, + "sampling/sampling_logp_difference/mean": 0.027546117082238197, + "step": 152, + "step_time": 90.55313208300504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.36300113797187805, + "epoch": 0.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0927797555923462, + "kl": 0.005236159078776836, + "learning_rate": 4.80736811078543e-06, + "loss": -0.1923, + "num_tokens": 855547.0, + "reward": 0.33000001311302185, + "reward_std": 0.560116171836853, + "rewards/reward_func/mean": 0.33000001311302185, + "rewards/reward_func/std": 0.538993775844574, + "sampling/importance_sampling_ratio/max": 1.9103295803070068, + "sampling/importance_sampling_ratio/mean": 1.0825953483581543, + "sampling/importance_sampling_ratio/min": 0.6134956479072571, + "sampling/sampling_logp_difference/max": 0.306821346282959, + "sampling/sampling_logp_difference/mean": 0.018912725150585175, + "step": 153, + "step_time": 79.64092588599306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3884934186935425, + "epoch": 0.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1354352235794067, + "kl": 0.00976630486547947, + "learning_rate": 4.804239308042392e-06, + "loss": 0.0839, + "num_tokens": 861032.0, + "reward": 0.4975000023841858, + "reward_std": 0.5802370309829712, + "rewards/reward_func/mean": 0.4975000023841858, + "rewards/reward_func/std": 0.5372084379196167, + "sampling/importance_sampling_ratio/max": 1.285419225692749, + "sampling/importance_sampling_ratio/mean": 0.8535559177398682, + "sampling/importance_sampling_ratio/min": 0.2762400805950165, + "sampling/sampling_logp_difference/max": 0.7990829944610596, + "sampling/sampling_logp_difference/mean": 0.027860336005687714, + "step": 154, + "step_time": 65.1173454009986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3430503308773041, + "epoch": 0.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.299552083015442, + "kl": 0.005469894502311945, + "learning_rate": 4.8010863349193605e-06, + "loss": -0.1091, + "num_tokens": 866546.0, + "reward": 0.46000000834465027, + "reward_std": 0.5039982199668884, + "rewards/reward_func/mean": 0.46000000834465027, + "rewards/reward_func/std": 0.5573149919509888, + "sampling/importance_sampling_ratio/max": 1.5915218591690063, + "sampling/importance_sampling_ratio/mean": 1.0774503946304321, + "sampling/importance_sampling_ratio/min": 0.6246634721755981, + "sampling/sampling_logp_difference/max": 0.32491254806518555, + "sampling/sampling_logp_difference/mean": 0.021029043942689896, + "step": 155, + "step_time": 62.39225424500182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3359929323196411, + "epoch": 0.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.646360158920288, + "kl": 0.013532448559999466, + "learning_rate": 4.797909224489531e-06, + "loss": 0.0599, + "num_tokens": 872235.0, + "reward": 0.08875000476837158, + "reward_std": 0.27286672592163086, + "rewards/reward_func/mean": 0.08875000476837158, + "rewards/reward_func/std": 0.365764856338501, + "sampling/importance_sampling_ratio/max": 1.6614181995391846, + "sampling/importance_sampling_ratio/mean": 0.9551100730895996, + "sampling/importance_sampling_ratio/min": 0.45479723811149597, + "sampling/sampling_logp_difference/max": 0.41816186904907227, + "sampling/sampling_logp_difference/mean": 0.02349046617746353, + "step": 156, + "step_time": 86.28342253799201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.36201316118240356, + "epoch": 0.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2846462726593018, + "kl": 0.0050716521218419075, + "learning_rate": 4.794708010079288e-06, + "loss": 0.1633, + "num_tokens": 878136.0, + "reward": 0.3462499976158142, + "reward_std": 0.5590543746948242, + "rewards/reward_func/mean": 0.3462499976158142, + "rewards/reward_func/std": 0.5334774851799011, + "sampling/importance_sampling_ratio/max": 1.8937947750091553, + "sampling/importance_sampling_ratio/mean": 1.087288498878479, + "sampling/importance_sampling_ratio/min": 0.5765236020088196, + "sampling/sampling_logp_difference/max": 0.5770103931427002, + "sampling/sampling_logp_difference/mean": 0.019394293427467346, + "step": 157, + "step_time": 77.44638174801366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 45.5, + "completions/mean_terminated_length": 45.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.35876867175102234, + "epoch": 0.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6277668476104736, + "kl": 0.012355468235909939, + "learning_rate": 4.791482725267858e-06, + "loss": 0.0167, + "num_tokens": 883346.0, + "reward": 0.32499998807907104, + "reward_std": 0.5781960487365723, + "rewards/reward_func/mean": 0.32499998807907104, + "rewards/reward_func/std": 0.5492072105407715, + "sampling/importance_sampling_ratio/max": 1.5887843370437622, + "sampling/importance_sampling_ratio/mean": 0.9820950627326965, + "sampling/importance_sampling_ratio/min": 0.44207823276519775, + "sampling/sampling_logp_difference/max": 0.6413552761077881, + "sampling/sampling_logp_difference/mean": 0.023921802639961243, + "step": 158, + "step_time": 64.3381597440166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.32425469160079956, + "epoch": 0.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9400520324707031, + "kl": 0.004790422506630421, + "learning_rate": 4.78823340388695e-06, + "loss": -0.117, + "num_tokens": 889686.0, + "reward": 0.2187499850988388, + "reward_std": 0.31793859601020813, + "rewards/reward_func/mean": 0.2187499850988388, + "rewards/reward_func/std": 0.47139421105384827, + "sampling/importance_sampling_ratio/max": 1.150166392326355, + "sampling/importance_sampling_ratio/mean": 0.8458471894264221, + "sampling/importance_sampling_ratio/min": 0.521885335445404, + "sampling/sampling_logp_difference/max": 0.45719194412231445, + "sampling/sampling_logp_difference/mean": 0.020912881940603256, + "step": 159, + "step_time": 83.63672647799831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.362440288066864, + "epoch": 0.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.179018259048462, + "kl": 0.009544030763208866, + "learning_rate": 4.7849600800204075e-06, + "loss": -0.0725, + "num_tokens": 895303.0, + "reward": 0.3062500059604645, + "reward_std": 0.5794415473937988, + "rewards/reward_func/mean": 0.3062500059604645, + "rewards/reward_func/std": 0.5538163185119629, + "sampling/importance_sampling_ratio/max": 1.3555833101272583, + "sampling/importance_sampling_ratio/mean": 0.8662522435188293, + "sampling/importance_sampling_ratio/min": 0.48955845832824707, + "sampling/sampling_logp_difference/max": 0.3127005100250244, + "sampling/sampling_logp_difference/mean": 0.02036750502884388, + "step": 160, + "step_time": 75.56624679401284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3294234275817871, + "epoch": 0.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9646108150482178, + "kl": 0.008349942974746227, + "learning_rate": 4.781662788003851e-06, + "loss": 0.04, + "num_tokens": 900212.0, + "reward": 0.3125, + "reward_std": 0.28340619802474976, + "rewards/reward_func/mean": 0.3125, + "rewards/reward_func/std": 0.5329098105430603, + "sampling/importance_sampling_ratio/max": 1.3511476516723633, + "sampling/importance_sampling_ratio/mean": 0.9460296034812927, + "sampling/importance_sampling_ratio/min": 0.5577508211135864, + "sampling/sampling_logp_difference/max": 0.357053279876709, + "sampling/sampling_logp_difference/mean": 0.02139732614159584, + "step": 161, + "step_time": 63.430384036997566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.37909457087516785, + "epoch": 0.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1150704622268677, + "kl": 0.006739302072674036, + "learning_rate": 4.778341562424312e-06, + "loss": -0.0058, + "num_tokens": 905567.0, + "reward": 0.05624999478459358, + "reward_std": 0.30837467312812805, + "rewards/reward_func/mean": 0.05624999478459358, + "rewards/reward_func/std": 0.3843710124492645, + "sampling/importance_sampling_ratio/max": 1.5031989812850952, + "sampling/importance_sampling_ratio/mean": 0.9705907106399536, + "sampling/importance_sampling_ratio/min": 0.4351043999195099, + "sampling/sampling_logp_difference/max": 0.31818532943725586, + "sampling/sampling_logp_difference/mean": 0.023320209234952927, + "step": 162, + "step_time": 88.88530365700717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 49.375, + "completions/mean_terminated_length": 49.375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3668830394744873, + "epoch": 0.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6112715005874634, + "kl": 0.00733697135001421, + "learning_rate": 4.774996438119876e-06, + "loss": -0.2199, + "num_tokens": 910978.0, + "reward": 0.4387499988079071, + "reward_std": 0.6334177255630493, + "rewards/reward_func/mean": 0.4387499988079071, + "rewards/reward_func/std": 0.5865988731384277, + "sampling/importance_sampling_ratio/max": 1.7501353025436401, + "sampling/importance_sampling_ratio/mean": 1.1737459897994995, + "sampling/importance_sampling_ratio/min": 0.5407097935676575, + "sampling/sampling_logp_difference/max": 0.5975207090377808, + "sampling/sampling_logp_difference/mean": 0.023949533700942993, + "step": 163, + "step_time": 77.05458755500149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3550381660461426, + "epoch": 0.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4822602272033691, + "kl": 0.018688620999455452, + "learning_rate": 4.771627450179315e-06, + "loss": 0.1415, + "num_tokens": 916968.0, + "reward": 0.07750000059604645, + "reward_std": 0.2801649868488312, + "rewards/reward_func/mean": 0.07750000059604645, + "rewards/reward_func/std": 0.36939141154289246, + "sampling/importance_sampling_ratio/max": 2.532761573791504, + "sampling/importance_sampling_ratio/mean": 1.3982677459716797, + "sampling/importance_sampling_ratio/min": 0.7517146468162537, + "sampling/sampling_logp_difference/max": 0.45490455627441406, + "sampling/sampling_logp_difference/mean": 0.020801950246095657, + "step": 164, + "step_time": 82.38432875502622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.31968969106674194, + "epoch": 0.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8617737293243408, + "kl": 0.013258876278996468, + "learning_rate": 4.768234633941716e-06, + "loss": -0.1274, + "num_tokens": 923326.0, + "reward": 0.3487499952316284, + "reward_std": 0.5436524152755737, + "rewards/reward_func/mean": 0.3487499952316284, + "rewards/reward_func/std": 0.5261297821998596, + "sampling/importance_sampling_ratio/max": 1.1405854225158691, + "sampling/importance_sampling_ratio/mean": 0.9033545255661011, + "sampling/importance_sampling_ratio/min": 0.7151353359222412, + "sampling/sampling_logp_difference/max": 0.31670236587524414, + "sampling/sampling_logp_difference/mean": 0.020617477595806122, + "step": 165, + "step_time": 91.76991003201692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.34689170122146606, + "epoch": 0.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9533897638320923, + "kl": 0.005911373533308506, + "learning_rate": 4.764818024996117e-06, + "loss": 0.038, + "num_tokens": 929389.0, + "reward": 0.07500000298023224, + "reward_std": 0.27509522438049316, + "rewards/reward_func/mean": 0.07500000298023224, + "rewards/reward_func/std": 0.3702123165130615, + "sampling/importance_sampling_ratio/max": 1.1052998304367065, + "sampling/importance_sampling_ratio/mean": 0.8497065305709839, + "sampling/importance_sampling_ratio/min": 0.5713857412338257, + "sampling/sampling_logp_difference/max": 0.2978546619415283, + "sampling/sampling_logp_difference/mean": 0.019739100709557533, + "step": 166, + "step_time": 84.64203599700704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 52.0, + "completions/mean_terminated_length": 52.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3531273603439331, + "epoch": 0.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5965367555618286, + "kl": 0.017820192500948906, + "learning_rate": 4.76137765918113e-06, + "loss": -0.1656, + "num_tokens": 934556.0, + "reward": 0.20249998569488525, + "reward_std": 0.5291311740875244, + "rewards/reward_func/mean": 0.20249998569488525, + "rewards/reward_func/std": 0.4898906648159027, + "sampling/importance_sampling_ratio/max": 1.6110693216323853, + "sampling/importance_sampling_ratio/mean": 1.1585469245910645, + "sampling/importance_sampling_ratio/min": 0.7089804410934448, + "sampling/sampling_logp_difference/max": 0.5340450406074524, + "sampling/sampling_logp_difference/mean": 0.02586180344223976, + "step": 167, + "step_time": 66.61828825098928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.31788909435272217, + "epoch": 0.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9643736481666565, + "kl": 0.008403636515140533, + "learning_rate": 4.757913572584564e-06, + "loss": 0.017, + "num_tokens": 939873.0, + "reward": 0.20375001430511475, + "reward_std": 0.5219398736953735, + "rewards/reward_func/mean": 0.20375001430511475, + "rewards/reward_func/std": 0.4842354357242584, + "sampling/importance_sampling_ratio/max": 1.196733832359314, + "sampling/importance_sampling_ratio/mean": 0.8918753266334534, + "sampling/importance_sampling_ratio/min": 0.45229190587997437, + "sampling/sampling_logp_difference/max": 0.3446998596191406, + "sampling/sampling_logp_difference/mean": 0.01955568790435791, + "step": 168, + "step_time": 73.99562716198852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.34987837076187134, + "epoch": 0.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0800118446350098, + "kl": 0.006376350298523903, + "learning_rate": 4.754425801543047e-06, + "loss": 0.0112, + "num_tokens": 945824.0, + "reward": 0.2175000011920929, + "reward_std": 0.3063386380672455, + "rewards/reward_func/mean": 0.2175000011920929, + "rewards/reward_func/std": 0.477426141500473, + "sampling/importance_sampling_ratio/max": 1.8290735483169556, + "sampling/importance_sampling_ratio/mean": 1.1064127683639526, + "sampling/importance_sampling_ratio/min": 0.43567004799842834, + "sampling/sampling_logp_difference/max": 0.31137561798095703, + "sampling/sampling_logp_difference/mean": 0.020785929635167122, + "step": 169, + "step_time": 93.93666018798831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 51.625, + "completions/mean_terminated_length": 51.625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.35052281618118286, + "epoch": 0.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.340181589126587, + "kl": 0.0070198904722929, + "learning_rate": 4.750914382641647e-06, + "loss": -0.1266, + "num_tokens": 951240.0, + "reward": 0.3087499737739563, + "reward_std": 0.2839585244655609, + "rewards/reward_func/mean": 0.3087499737739563, + "rewards/reward_func/std": 0.5617685317993164, + "sampling/importance_sampling_ratio/max": 1.5249656438827515, + "sampling/importance_sampling_ratio/mean": 0.9443210363388062, + "sampling/importance_sampling_ratio/min": 0.6084120869636536, + "sampling/sampling_logp_difference/max": 0.3037455081939697, + "sampling/sampling_logp_difference/mean": 0.020245909690856934, + "step": 170, + "step_time": 85.42551145199104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 42.375, + "completions/mean_terminated_length": 42.375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.3007114827632904, + "epoch": 0.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.134536623954773, + "kl": 0.011100707575678825, + "learning_rate": 4.747379352713489e-06, + "loss": -0.001, + "num_tokens": 956957.0, + "reward": 0.33124998211860657, + "reward_std": 0.2721617817878723, + "rewards/reward_func/mean": 0.33124998211860657, + "rewards/reward_func/std": 0.5298096537590027, + "sampling/importance_sampling_ratio/max": 1.7444802522659302, + "sampling/importance_sampling_ratio/mean": 1.0147829055786133, + "sampling/importance_sampling_ratio/min": 0.4858468472957611, + "sampling/sampling_logp_difference/max": 0.3548402786254883, + "sampling/sampling_logp_difference/mean": 0.024134717881679535, + "step": 171, + "step_time": 83.73731894500088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.36536312103271484, + "epoch": 0.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.420339345932007, + "kl": 0.015561670064926147, + "learning_rate": 4.743820748839362e-06, + "loss": -0.1682, + "num_tokens": 962384.0, + "reward": 0.23499999940395355, + "reward_std": 0.30095145106315613, + "rewards/reward_func/mean": 0.23499999940395355, + "rewards/reward_func/std": 0.46632298827171326, + "sampling/importance_sampling_ratio/max": 2.9884486198425293, + "sampling/importance_sampling_ratio/mean": 1.253305435180664, + "sampling/importance_sampling_ratio/min": 0.40475034713745117, + "sampling/sampling_logp_difference/max": 0.4607217311859131, + "sampling/sampling_logp_difference/mean": 0.029603634029626846, + "step": 172, + "step_time": 79.53238872098154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 44.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.33054256439208984, + "epoch": 0.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2542983293533325, + "kl": 0.012076465412974358, + "learning_rate": 4.740238608347337e-06, + "loss": -0.0235, + "num_tokens": 968102.0, + "reward": 0.4775000214576721, + "reward_std": 0.5982934236526489, + "rewards/reward_func/mean": 0.4775000214576721, + "rewards/reward_func/std": 0.5544044971466064, + "sampling/importance_sampling_ratio/max": 1.2516276836395264, + "sampling/importance_sampling_ratio/mean": 1.0209238529205322, + "sampling/importance_sampling_ratio/min": 0.8097511529922485, + "sampling/sampling_logp_difference/max": 0.3150825500488281, + "sampling/sampling_logp_difference/mean": 0.022141385823488235, + "step": 173, + "step_time": 67.22070981800789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 54.75, + "completions/mean_terminated_length": 54.75, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3209684491157532, + "epoch": 0.348, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5610387325286865, + "kl": 0.007439862936735153, + "learning_rate": 4.736632968812374e-06, + "loss": -0.0656, + "num_tokens": 973329.0, + "reward": 0.4699999988079071, + "reward_std": 0.612058162689209, + "rewards/reward_func/mean": 0.4699999988079071, + "rewards/reward_func/std": 0.5669718980789185, + "sampling/importance_sampling_ratio/max": 2.20025897026062, + "sampling/importance_sampling_ratio/mean": 1.2844336032867432, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.6814525127410889, + "sampling/sampling_logp_difference/mean": 0.029089387506246567, + "step": 174, + "step_time": 59.10385779500939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 46.375, + "completions/mean_terminated_length": 46.375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3398328423500061, + "epoch": 0.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8624789714813232, + "kl": 0.011860033497214317, + "learning_rate": 4.733003868055923e-06, + "loss": 0.1904, + "num_tokens": 979417.0, + "reward": 0.05624999478459358, + "reward_std": 0.265840083360672, + "rewards/reward_func/mean": 0.05624999478459358, + "rewards/reward_func/std": 0.3444223999977112, + "sampling/importance_sampling_ratio/max": 1.2150940895080566, + "sampling/importance_sampling_ratio/mean": 0.98213791847229, + "sampling/importance_sampling_ratio/min": 0.5763043165206909, + "sampling/sampling_logp_difference/max": 0.3341519832611084, + "sampling/sampling_logp_difference/mean": 0.020348751917481422, + "step": 175, + "step_time": 88.84378124101204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3720015287399292, + "epoch": 0.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8829252123832703, + "kl": 0.013934159651398659, + "learning_rate": 4.729351344145536e-06, + "loss": -0.0327, + "num_tokens": 984863.0, + "reward": 0.05249999836087227, + "reward_std": 0.30060574412345886, + "rewards/reward_func/mean": 0.05249999836087227, + "rewards/reward_func/std": 0.38100433349609375, + "sampling/importance_sampling_ratio/max": 1.3568812608718872, + "sampling/importance_sampling_ratio/mean": 0.8758584260940552, + "sampling/importance_sampling_ratio/min": 0.5294094681739807, + "sampling/sampling_logp_difference/max": 0.36570286750793457, + "sampling/sampling_logp_difference/mean": 0.023136310279369354, + "step": 176, + "step_time": 95.73476834298344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 44.75, + "completions/mean_terminated_length": 44.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.36852994561195374, + "epoch": 0.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2951862812042236, + "kl": 0.021301502361893654, + "learning_rate": 4.725675435394461e-06, + "loss": 0.164, + "num_tokens": 990337.0, + "reward": 0.06875000894069672, + "reward_std": 0.2854534685611725, + "rewards/reward_func/mean": 0.06875000894069672, + "rewards/reward_func/std": 0.37745150923728943, + "sampling/importance_sampling_ratio/max": 2.195624589920044, + "sampling/importance_sampling_ratio/mean": 0.981530487537384, + "sampling/importance_sampling_ratio/min": 0.4619598090648651, + "sampling/sampling_logp_difference/max": 0.628758430480957, + "sampling/sampling_logp_difference/mean": 0.025460662320256233, + "step": 177, + "step_time": 67.39933587997803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3247171640396118, + "epoch": 0.356, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6371877193450928, + "kl": 0.012556849978864193, + "learning_rate": 4.721976180361239e-06, + "loss": 0.075, + "num_tokens": 995402.0, + "reward": 0.17749999463558197, + "reward_std": 0.3520262837409973, + "rewards/reward_func/mean": 0.17749999463558197, + "rewards/reward_func/std": 0.5018181204795837, + "sampling/importance_sampling_ratio/max": 1.499563455581665, + "sampling/importance_sampling_ratio/mean": 0.9744052290916443, + "sampling/importance_sampling_ratio/min": 0.5791205763816833, + "sampling/sampling_logp_difference/max": 0.4286665916442871, + "sampling/sampling_logp_difference/mean": 0.023523185402154922, + "step": 178, + "step_time": 59.054495546006365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3754112720489502, + "epoch": 0.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0154576301574707, + "kl": 0.013785503804683685, + "learning_rate": 4.718253617849306e-06, + "loss": 0.0381, + "num_tokens": 1001387.0, + "reward": 0.08749999105930328, + "reward_std": 0.2781248092651367, + "rewards/reward_func/mean": 0.08749999105930328, + "rewards/reward_func/std": 0.3700868785381317, + "sampling/importance_sampling_ratio/max": 1.0287582874298096, + "sampling/importance_sampling_ratio/mean": 0.8582373857498169, + "sampling/importance_sampling_ratio/min": 0.6108002066612244, + "sampling/sampling_logp_difference/max": 0.3374152183532715, + "sampling/sampling_logp_difference/mean": 0.023500245064496994, + "step": 179, + "step_time": 74.29987861201516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 42.0, + "completions/mean_terminated_length": 42.0, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3148415982723236, + "epoch": 0.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8665428757667542, + "kl": 0.014602867886424065, + "learning_rate": 4.7145077869065815e-06, + "loss": 0.2052, + "num_tokens": 1006871.0, + "reward": 0.20875000953674316, + "reward_std": 0.5284746885299683, + "rewards/reward_func/mean": 0.20875000953674316, + "rewards/reward_func/std": 0.48932716250419617, + "sampling/importance_sampling_ratio/max": 1.6064436435699463, + "sampling/importance_sampling_ratio/mean": 0.8494887948036194, + "sampling/importance_sampling_ratio/min": 0.28991734981536865, + "sampling/sampling_logp_difference/max": 0.5766005516052246, + "sampling/sampling_logp_difference/mean": 0.022570453584194183, + "step": 180, + "step_time": 75.10994843998924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 46.625, + "completions/mean_terminated_length": 46.625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.35348600149154663, + "epoch": 0.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.372689962387085, + "kl": 0.009884382598102093, + "learning_rate": 4.710738726825059e-06, + "loss": 0.1381, + "num_tokens": 1012819.0, + "reward": 0.20874999463558197, + "reward_std": 0.5279327034950256, + "rewards/reward_func/mean": 0.20874999463558197, + "rewards/reward_func/std": 0.48888903856277466, + "sampling/importance_sampling_ratio/max": 1.244268774986267, + "sampling/importance_sampling_ratio/mean": 0.9364046454429626, + "sampling/importance_sampling_ratio/min": 0.6309903264045715, + "sampling/sampling_logp_difference/max": 0.3161327838897705, + "sampling/sampling_logp_difference/mean": 0.02107790857553482, + "step": 181, + "step_time": 80.6431828700006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.3191605806350708, + "epoch": 0.364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2262747287750244, + "kl": 0.011803516186773777, + "learning_rate": 4.706946477140396e-06, + "loss": 0.0117, + "num_tokens": 1017886.0, + "reward": 0.08250000327825546, + "reward_std": 0.2791511118412018, + "rewards/reward_func/mean": 0.08250000327825546, + "rewards/reward_func/std": 0.37247246503829956, + "sampling/importance_sampling_ratio/max": 1.2804160118103027, + "sampling/importance_sampling_ratio/mean": 0.7288067936897278, + "sampling/importance_sampling_ratio/min": 0.4809218645095825, + "sampling/sampling_logp_difference/max": 0.35615110397338867, + "sampling/sampling_logp_difference/mean": 0.023954380303621292, + "step": 182, + "step_time": 66.17639043199597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 41.375, + "completions/mean_terminated_length": 41.375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.31017425656318665, + "epoch": 0.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0836074352264404, + "kl": 0.014769114553928375, + "learning_rate": 4.703131077631498e-06, + "loss": 0.1999, + "num_tokens": 1023314.0, + "reward": 0.3387500047683716, + "reward_std": 0.2747931182384491, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.5456958413124084, + "sampling/importance_sampling_ratio/max": 2.228360414505005, + "sampling/importance_sampling_ratio/mean": 1.1713675260543823, + "sampling/importance_sampling_ratio/min": 0.5424574017524719, + "sampling/sampling_logp_difference/max": 0.5427889823913574, + "sampling/sampling_logp_difference/mean": 0.02546188049018383, + "step": 183, + "step_time": 82.83274039000389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 44.625, + "completions/mean_terminated_length": 44.625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.35528260469436646, + "epoch": 0.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2007426023483276, + "kl": 0.006659870967268944, + "learning_rate": 4.699292568320097e-06, + "loss": -0.0313, + "num_tokens": 1028524.0, + "reward": 0.32625001668930054, + "reward_std": 0.5633938312530518, + "rewards/reward_func/mean": 0.32625001668930054, + "rewards/reward_func/std": 0.5353753566741943, + "sampling/importance_sampling_ratio/max": 1.7591568231582642, + "sampling/importance_sampling_ratio/mean": 1.0408813953399658, + "sampling/importance_sampling_ratio/min": 0.6595721244812012, + "sampling/sampling_logp_difference/max": 0.7866129875183105, + "sampling/sampling_logp_difference/mean": 0.021467799320816994, + "step": 184, + "step_time": 79.17429194701253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.39394837617874146, + "epoch": 0.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.294677734375, + "kl": 0.015325892716646194, + "learning_rate": 4.6954309894703435e-06, + "loss": -0.0185, + "num_tokens": 1033728.0, + "reward": 0.14000000059604645, + "reward_std": 0.5303218364715576, + "rewards/reward_func/mean": 0.14000000059604645, + "rewards/reward_func/std": 0.4927183985710144, + "sampling/importance_sampling_ratio/max": 1.6721136569976807, + "sampling/importance_sampling_ratio/mean": 0.8524694442749023, + "sampling/importance_sampling_ratio/min": 0.3965020775794983, + "sampling/sampling_logp_difference/max": 0.49992823600769043, + "sampling/sampling_logp_difference/mean": 0.025052586570382118, + "step": 185, + "step_time": 88.38119602698134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 46.125, + "completions/mean_terminated_length": 46.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.27076610922813416, + "epoch": 0.372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7855340838432312, + "kl": 0.01319638080894947, + "learning_rate": 4.69154638158837e-06, + "loss": 0.0525, + "num_tokens": 1039269.0, + "reward": 0.3199999928474426, + "reward_std": 0.5744900107383728, + "rewards/reward_func/mean": 0.3199999928474426, + "rewards/reward_func/std": 0.5606883764266968, + "sampling/importance_sampling_ratio/max": 1.3193827867507935, + "sampling/importance_sampling_ratio/mean": 0.7310043573379517, + "sampling/importance_sampling_ratio/min": 0.36375343799591064, + "sampling/sampling_logp_difference/max": 0.707329273223877, + "sampling/sampling_logp_difference/mean": 0.02303919941186905, + "step": 186, + "step_time": 46.03551520599285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 44.25, + "completions/mean_terminated_length": 44.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3389705419540405, + "epoch": 0.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0259521007537842, + "kl": 0.0074524343945086, + "learning_rate": 4.687638785421875e-06, + "loss": 0.0077, + "num_tokens": 1046492.0, + "reward": 0.1899999976158142, + "reward_std": 0.31486421823501587, + "rewards/reward_func/mean": 0.1899999976158142, + "rewards/reward_func/std": 0.476385235786438, + "sampling/importance_sampling_ratio/max": 1.1441234350204468, + "sampling/importance_sampling_ratio/mean": 0.8540750741958618, + "sampling/importance_sampling_ratio/min": 0.4722847044467926, + "sampling/sampling_logp_difference/max": 0.43096935749053955, + "sampling/sampling_logp_difference/mean": 0.02116192877292633, + "step": 187, + "step_time": 108.60295571299503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3361474871635437, + "epoch": 0.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7023673057556152, + "kl": 0.01211222168058157, + "learning_rate": 4.683708241959694e-06, + "loss": -0.4484, + "num_tokens": 1052225.0, + "reward": 0.20374999940395355, + "reward_std": 0.5282833576202393, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.4892541766166687, + "sampling/importance_sampling_ratio/max": 1.8190691471099854, + "sampling/importance_sampling_ratio/mean": 1.0389931201934814, + "sampling/importance_sampling_ratio/min": 0.39706769585609436, + "sampling/sampling_logp_difference/max": 0.3256983757019043, + "sampling/sampling_logp_difference/mean": 0.022885797545313835, + "step": 188, + "step_time": 71.87692314898595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3790978193283081, + "epoch": 0.378, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.685671091079712, + "kl": 0.02672746405005455, + "learning_rate": 4.679754792431368e-06, + "loss": -0.3355, + "num_tokens": 1057327.0, + "reward": 0.3125, + "reward_std": 0.592415452003479, + "rewards/reward_func/mean": 0.3125, + "rewards/reward_func/std": 0.5662848949432373, + "sampling/importance_sampling_ratio/max": 2.115366220474243, + "sampling/importance_sampling_ratio/mean": 1.1775258779525757, + "sampling/importance_sampling_ratio/min": 0.5436846017837524, + "sampling/sampling_logp_difference/max": 0.46004533767700195, + "sampling/sampling_logp_difference/mean": 0.02344960719347, + "step": 189, + "step_time": 89.23043878999306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.34846365451812744, + "epoch": 0.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1112720966339111, + "kl": 0.01807713694870472, + "learning_rate": 4.675778478306712e-06, + "loss": 0.1345, + "num_tokens": 1062997.0, + "reward": 0.21125000715255737, + "reward_std": 0.5194716453552246, + "rewards/reward_func/mean": 0.21125000715255737, + "rewards/reward_func/std": 0.4811723232269287, + "sampling/importance_sampling_ratio/max": 1.4994481801986694, + "sampling/importance_sampling_ratio/mean": 1.0004935264587402, + "sampling/importance_sampling_ratio/min": 0.4650833010673523, + "sampling/sampling_logp_difference/max": 0.5518231391906738, + "sampling/sampling_logp_difference/mean": 0.02722608856856823, + "step": 190, + "step_time": 80.97714860000997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3735997676849365, + "epoch": 0.382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2785558700561523, + "kl": 0.018139660358428955, + "learning_rate": 4.671779341295378e-06, + "loss": 0.1762, + "num_tokens": 1067953.0, + "reward": 0.20374999940395355, + "reward_std": 0.5309150815010071, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.49260058999061584, + "sampling/importance_sampling_ratio/max": 1.8519113063812256, + "sampling/importance_sampling_ratio/mean": 1.313336730003357, + "sampling/importance_sampling_ratio/min": 0.7735275626182556, + "sampling/sampling_logp_difference/max": 0.33982229232788086, + "sampling/sampling_logp_difference/mean": 0.028083689510822296, + "step": 191, + "step_time": 77.81035732399323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.3238619267940521, + "epoch": 0.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0245407819747925, + "kl": 0.014340454712510109, + "learning_rate": 4.667757423346423e-06, + "loss": 0.0233, + "num_tokens": 1072876.0, + "reward": 0.3050000071525574, + "reward_std": 0.6045562028884888, + "rewards/reward_func/mean": 0.3050000071525574, + "rewards/reward_func/std": 0.574978232383728, + "sampling/importance_sampling_ratio/max": 1.4169648885726929, + "sampling/importance_sampling_ratio/mean": 0.9852313995361328, + "sampling/importance_sampling_ratio/min": 0.6186890602111816, + "sampling/sampling_logp_difference/max": 0.32411623001098633, + "sampling/sampling_logp_difference/mean": 0.021142879500985146, + "step": 192, + "step_time": 65.84021737999865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 46.0, + "completions/mean_terminated_length": 46.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.335887610912323, + "epoch": 0.386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4161148071289062, + "kl": 0.013850709423422813, + "learning_rate": 4.663712766647862e-06, + "loss": -0.0187, + "num_tokens": 1079270.0, + "reward": 0.17625001072883606, + "reward_std": 0.34232813119888306, + "rewards/reward_func/mean": 0.17625001072883606, + "rewards/reward_func/std": 0.49956947565078735, + "sampling/importance_sampling_ratio/max": 1.827757716178894, + "sampling/importance_sampling_ratio/mean": 1.0969743728637695, + "sampling/importance_sampling_ratio/min": 0.5536801815032959, + "sampling/sampling_logp_difference/max": 0.36859893798828125, + "sampling/sampling_logp_difference/mean": 0.023405691608786583, + "step": 193, + "step_time": 109.58445479700458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.31033533811569214, + "epoch": 0.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.027238368988037, + "kl": 0.016297120600938797, + "learning_rate": 4.65964541362623e-06, + "loss": -0.0868, + "num_tokens": 1084716.0, + "reward": 0.3387500047683716, + "reward_std": 0.5553901791572571, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.5315056443214417, + "sampling/importance_sampling_ratio/max": 1.3392726182937622, + "sampling/importance_sampling_ratio/mean": 1.071367859840393, + "sampling/importance_sampling_ratio/min": 0.7315554022789001, + "sampling/sampling_logp_difference/max": 0.49803805351257324, + "sampling/sampling_logp_difference/mean": 0.02089758589863777, + "step": 194, + "step_time": 67.651659035997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 44.125, + "completions/mean_terminated_length": 44.125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.3812289834022522, + "epoch": 0.39, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7000164985656738, + "kl": 0.01980014517903328, + "learning_rate": 4.655555406946135e-06, + "loss": -0.1177, + "num_tokens": 1089906.0, + "reward": 0.32500001788139343, + "reward_std": 0.5569195747375488, + "rewards/reward_func/mean": 0.32500001788139343, + "rewards/reward_func/std": 0.5433756709098816, + "sampling/importance_sampling_ratio/max": 1.6265194416046143, + "sampling/importance_sampling_ratio/mean": 1.0881175994873047, + "sampling/importance_sampling_ratio/min": 0.6452130675315857, + "sampling/sampling_logp_difference/max": 0.3572232723236084, + "sampling/sampling_logp_difference/mean": 0.023923953995108604, + "step": 195, + "step_time": 64.9109322100121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.37043747305870056, + "epoch": 0.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.687000036239624, + "kl": 0.022207628935575485, + "learning_rate": 4.651442789509813e-06, + "loss": 0.7241, + "num_tokens": 1095253.0, + "reward": -0.03999999910593033, + "reward_std": 0.04871772229671478, + "rewards/reward_func/mean": -0.03999999910593033, + "rewards/reward_func/std": 0.05182388052344322, + "sampling/importance_sampling_ratio/max": 2.730234146118164, + "sampling/importance_sampling_ratio/mean": 1.2118090391159058, + "sampling/importance_sampling_ratio/min": 0.32341212034225464, + "sampling/sampling_logp_difference/max": 0.3899533748626709, + "sampling/sampling_logp_difference/mean": 0.027441177517175674, + "step": 196, + "step_time": 88.97580981699866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.4157707095146179, + "epoch": 0.394, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3999090194702148, + "kl": 0.017077336087822914, + "learning_rate": 4.647307604456675e-06, + "loss": 0.1207, + "num_tokens": 1101561.0, + "reward": 0.07624999433755875, + "reward_std": 0.2700246274471283, + "rewards/reward_func/mean": 0.07624999433755875, + "rewards/reward_func/std": 0.35860592126846313, + "sampling/importance_sampling_ratio/max": 1.3688700199127197, + "sampling/importance_sampling_ratio/mean": 0.9039748907089233, + "sampling/importance_sampling_ratio/min": 0.5610687732696533, + "sampling/sampling_logp_difference/max": 0.30984562635421753, + "sampling/sampling_logp_difference/mean": 0.026080135256052017, + "step": 197, + "step_time": 79.29961144700064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.35132038593292236, + "epoch": 0.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0895273685455322, + "kl": 0.014003671705722809, + "learning_rate": 4.643149895162854e-06, + "loss": -0.0698, + "num_tokens": 1106835.0, + "reward": 0.1850000023841858, + "reward_std": 0.3318837285041809, + "rewards/reward_func/mean": 0.1850000023841858, + "rewards/reward_func/std": 0.4803272783756256, + "sampling/importance_sampling_ratio/max": 1.1942050457000732, + "sampling/importance_sampling_ratio/mean": 0.8434613943099976, + "sampling/importance_sampling_ratio/min": 0.36741903424263, + "sampling/sampling_logp_difference/max": 0.5363888740539551, + "sampling/sampling_logp_difference/mean": 0.026127520948648453, + "step": 198, + "step_time": 85.95462637199671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.34718334674835205, + "epoch": 0.398, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.287834882736206, + "kl": 0.014416320249438286, + "learning_rate": 4.6389697052407535e-06, + "loss": -0.0184, + "num_tokens": 1112538.0, + "reward": 0.07625000178813934, + "reward_std": 0.27981066703796387, + "rewards/reward_func/mean": 0.07625000178813934, + "rewards/reward_func/std": 0.3634335398674011, + "sampling/importance_sampling_ratio/max": 1.8455092906951904, + "sampling/importance_sampling_ratio/mean": 1.0616990327835083, + "sampling/importance_sampling_ratio/min": 0.7087575197219849, + "sampling/sampling_logp_difference/max": 0.43187177181243896, + "sampling/sampling_logp_difference/mean": 0.02503993548452854, + "step": 199, + "step_time": 70.96149959298782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3079107701778412, + "epoch": 0.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0146771669387817, + "kl": 0.013202982023358345, + "learning_rate": 4.634767078538589e-06, + "loss": -0.1111, + "num_tokens": 1118132.0, + "reward": 0.32249999046325684, + "reward_std": 0.550460934638977, + "rewards/reward_func/mean": 0.32249999046325684, + "rewards/reward_func/std": 0.5314871072769165, + "sampling/importance_sampling_ratio/max": 1.090162992477417, + "sampling/importance_sampling_ratio/mean": 0.7846779823303223, + "sampling/importance_sampling_ratio/min": 0.5243642330169678, + "sampling/sampling_logp_difference/max": 0.552169919013977, + "sampling/sampling_logp_difference/mean": 0.019852038472890854, + "step": 200, + "step_time": 77.93985611898825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 44.0, + "completions/mean_terminated_length": 44.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.38525718450546265, + "epoch": 0.402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3107483386993408, + "kl": 0.020291147753596306, + "learning_rate": 4.630542059139923e-06, + "loss": 0.0876, + "num_tokens": 1123554.0, + "reward": 0.44999998807907104, + "reward_std": 0.5826581716537476, + "rewards/reward_func/mean": 0.44999998807907104, + "rewards/reward_func/std": 0.5397353768348694, + "sampling/importance_sampling_ratio/max": 1.5795011520385742, + "sampling/importance_sampling_ratio/mean": 0.9796627163887024, + "sampling/importance_sampling_ratio/min": 0.26825037598609924, + "sampling/sampling_logp_difference/max": 0.42403650283813477, + "sampling/sampling_logp_difference/mean": 0.02384255826473236, + "step": 201, + "step_time": 64.71107027700054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3214360177516937, + "epoch": 0.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9233896732330322, + "kl": 0.01243473682552576, + "learning_rate": 4.626294691363213e-06, + "loss": -0.006, + "num_tokens": 1129228.0, + "reward": 0.19374999403953552, + "reward_std": 0.5165963172912598, + "rewards/reward_func/mean": 0.19374999403953552, + "rewards/reward_func/std": 0.4797302186489105, + "sampling/importance_sampling_ratio/max": 1.7446887493133545, + "sampling/importance_sampling_ratio/mean": 1.0539482831954956, + "sampling/importance_sampling_ratio/min": 0.6484421491622925, + "sampling/sampling_logp_difference/max": 0.3515496253967285, + "sampling/sampling_logp_difference/mean": 0.02195613458752632, + "step": 202, + "step_time": 65.19753580997349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3714994192123413, + "epoch": 0.406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8211506009101868, + "kl": 0.010113951750099659, + "learning_rate": 4.622025019761336e-06, + "loss": 0.0358, + "num_tokens": 1134606.0, + "reward": 0.19249999523162842, + "reward_std": 0.5170982480049133, + "rewards/reward_func/mean": 0.19249999523162842, + "rewards/reward_func/std": 0.47927772998809814, + "sampling/importance_sampling_ratio/max": 2.0278983116149902, + "sampling/importance_sampling_ratio/mean": 0.9206888675689697, + "sampling/importance_sampling_ratio/min": 0.5617751479148865, + "sampling/sampling_logp_difference/max": 0.347994327545166, + "sampling/sampling_logp_difference/mean": 0.02045644447207451, + "step": 203, + "step_time": 71.5089026770147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 43.25, + "completions/mean_terminated_length": 43.25, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.3702104687690735, + "epoch": 0.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3650530576705933, + "kl": 0.030135968700051308, + "learning_rate": 4.617733089121127e-06, + "loss": 0.2188, + "num_tokens": 1139666.0, + "reward": 0.07249999791383743, + "reward_std": 0.29401281476020813, + "rewards/reward_func/mean": 0.07249999791383743, + "rewards/reward_func/std": 0.37803059816360474, + "sampling/importance_sampling_ratio/max": 1.2655539512634277, + "sampling/importance_sampling_ratio/mean": 0.7001688480377197, + "sampling/importance_sampling_ratio/min": 0.36836326122283936, + "sampling/sampling_logp_difference/max": 0.5306470394134521, + "sampling/sampling_logp_difference/mean": 0.030222740024328232, + "step": 204, + "step_time": 76.82139863798511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3269600570201874, + "epoch": 0.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2307133674621582, + "kl": 0.010514447465538979, + "learning_rate": 4.613418944462907e-06, + "loss": 0.0782, + "num_tokens": 1145168.0, + "reward": 0.33249998092651367, + "reward_std": 0.5493869781494141, + "rewards/reward_func/mean": 0.33249998092651367, + "rewards/reward_func/std": 0.5238797664642334, + "sampling/importance_sampling_ratio/max": 1.906840443611145, + "sampling/importance_sampling_ratio/mean": 1.0142680406570435, + "sampling/importance_sampling_ratio/min": 0.42224809527397156, + "sampling/sampling_logp_difference/max": 0.7108626365661621, + "sampling/sampling_logp_difference/mean": 0.02854611724615097, + "step": 205, + "step_time": 78.07254538699635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.39792150259017944, + "epoch": 0.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3023905754089355, + "kl": 0.02957381308078766, + "learning_rate": 4.609082631040012e-06, + "loss": -0.0722, + "num_tokens": 1150370.0, + "reward": 0.3500000238418579, + "reward_std": 0.5520753264427185, + "rewards/reward_func/mean": 0.3500000238418579, + "rewards/reward_func/std": 0.5316282510757446, + "sampling/importance_sampling_ratio/max": 1.5235435962677002, + "sampling/importance_sampling_ratio/mean": 0.9376378655433655, + "sampling/importance_sampling_ratio/min": 0.33686015009880066, + "sampling/sampling_logp_difference/max": 0.342923641204834, + "sampling/sampling_logp_difference/mean": 0.027645057067275047, + "step": 206, + "step_time": 63.247660350985825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.35857129096984863, + "epoch": 0.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0902588367462158, + "kl": 0.009508270770311356, + "learning_rate": 4.604724194338318e-06, + "loss": 0.0542, + "num_tokens": 1155624.0, + "reward": 0.48500001430511475, + "reward_std": 0.5167855620384216, + "rewards/reward_func/mean": 0.48500001430511475, + "rewards/reward_func/std": 0.5455272793769836, + "sampling/importance_sampling_ratio/max": 2.0466551780700684, + "sampling/importance_sampling_ratio/mean": 1.1512298583984375, + "sampling/importance_sampling_ratio/min": 0.4752918779850006, + "sampling/sampling_logp_difference/max": 0.3554987907409668, + "sampling/sampling_logp_difference/mean": 0.02029740810394287, + "step": 207, + "step_time": 50.03297023801133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 51.625, + "completions/mean_terminated_length": 51.625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3821730315685272, + "epoch": 0.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3871904611587524, + "kl": 0.01627778261899948, + "learning_rate": 4.600343680075764e-06, + "loss": -0.1631, + "num_tokens": 1161217.0, + "reward": 0.5975000262260437, + "reward_std": 0.5603698492050171, + "rewards/reward_func/mean": 0.5975000262260437, + "rewards/reward_func/std": 0.5395169854164124, + "sampling/importance_sampling_ratio/max": 2.740676164627075, + "sampling/importance_sampling_ratio/mean": 1.3194831609725952, + "sampling/importance_sampling_ratio/min": 0.6237443685531616, + "sampling/sampling_logp_difference/max": 0.33385396003723145, + "sampling/sampling_logp_difference/mean": 0.023148780688643456, + "step": 208, + "step_time": 48.81324854400009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.4071102738380432, + "epoch": 0.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3003276586532593, + "kl": 0.015321669168770313, + "learning_rate": 4.5959411342018715e-06, + "loss": 0.0384, + "num_tokens": 1166266.0, + "reward": 0.21250000596046448, + "reward_std": 0.31011754274368286, + "rewards/reward_func/mean": 0.21250000596046448, + "rewards/reward_func/std": 0.48414137959480286, + "sampling/importance_sampling_ratio/max": 1.357431173324585, + "sampling/importance_sampling_ratio/mean": 0.9291549921035767, + "sampling/importance_sampling_ratio/min": 0.40400320291519165, + "sampling/sampling_logp_difference/max": 0.3345675468444824, + "sampling/sampling_logp_difference/mean": 0.02914167195558548, + "step": 209, + "step_time": 58.75232109500212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 44.375, + "completions/mean_terminated_length": 44.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3340144157409668, + "epoch": 0.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2524479627609253, + "kl": 0.009892448782920837, + "learning_rate": 4.591516602897263e-06, + "loss": 0.0476, + "num_tokens": 1171977.0, + "reward": 0.20249998569488525, + "reward_std": 0.5303123593330383, + "rewards/reward_func/mean": 0.20249998569488525, + "rewards/reward_func/std": 0.491288423538208, + "sampling/importance_sampling_ratio/max": 1.9688581228256226, + "sampling/importance_sampling_ratio/mean": 0.9722031354904175, + "sampling/importance_sampling_ratio/min": 0.4560692310333252, + "sampling/sampling_logp_difference/max": 0.47432082891464233, + "sampling/sampling_logp_difference/mean": 0.024021849036216736, + "step": 210, + "step_time": 66.03959386100178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.36171913146972656, + "epoch": 0.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9004639387130737, + "kl": 0.01368255726993084, + "learning_rate": 4.587070132573178e-06, + "loss": -0.099, + "num_tokens": 1178226.0, + "reward": 0.3137499988079071, + "reward_std": 0.5728945732116699, + "rewards/reward_func/mean": 0.3137499988079071, + "rewards/reward_func/std": 0.5473296046257019, + "sampling/importance_sampling_ratio/max": 1.6000454425811768, + "sampling/importance_sampling_ratio/mean": 0.8045486211776733, + "sampling/importance_sampling_ratio/min": 0.18612989783287048, + "sampling/sampling_logp_difference/max": 0.4911985397338867, + "sampling/sampling_logp_difference/mean": 0.02342919073998928, + "step": 211, + "step_time": 82.29735374101438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 55.625, + "completions/mean_terminated_length": 55.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3647511601448059, + "epoch": 0.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4303269386291504, + "kl": 0.014668257907032967, + "learning_rate": 4.582601769870988e-06, + "loss": -0.0589, + "num_tokens": 1183454.0, + "reward": 0.05624999478459358, + "reward_std": 0.28012219071388245, + "rewards/reward_func/mean": 0.05624999478459358, + "rewards/reward_func/std": 0.36660364270210266, + "sampling/importance_sampling_ratio/max": 1.5942217111587524, + "sampling/importance_sampling_ratio/mean": 1.0475343465805054, + "sampling/importance_sampling_ratio/min": 0.5613923072814941, + "sampling/sampling_logp_difference/max": 0.30344557762145996, + "sampling/sampling_logp_difference/mean": 0.02498428151011467, + "step": 212, + "step_time": 81.58139562400174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.32731741666793823, + "epoch": 0.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9570350646972656, + "kl": 0.01391584612429142, + "learning_rate": 4.578111561661702e-06, + "loss": -0.0079, + "num_tokens": 1188684.0, + "reward": 0.20625001192092896, + "reward_std": 0.31164175271987915, + "rewards/reward_func/mean": 0.20625001192092896, + "rewards/reward_func/std": 0.4845598340034485, + "sampling/importance_sampling_ratio/max": 2.221400022506714, + "sampling/importance_sampling_ratio/mean": 1.2576444149017334, + "sampling/importance_sampling_ratio/min": 0.4654132127761841, + "sampling/sampling_logp_difference/max": 0.3340733051300049, + "sampling/sampling_logp_difference/mean": 0.02158265747129917, + "step": 213, + "step_time": 62.86962395600858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 44.875, + "completions/mean_terminated_length": 44.875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3720092177391052, + "epoch": 0.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0819950103759766, + "kl": 0.028701618313789368, + "learning_rate": 4.57359955504548e-06, + "loss": -0.2403, + "num_tokens": 1194175.0, + "reward": 0.059999994933605194, + "reward_std": 0.2900388836860657, + "rewards/reward_func/mean": 0.059999994933605194, + "rewards/reward_func/std": 0.3795486092567444, + "sampling/importance_sampling_ratio/max": 1.9808765649795532, + "sampling/importance_sampling_ratio/mean": 0.9989551305770874, + "sampling/importance_sampling_ratio/min": 0.3208101689815521, + "sampling/sampling_logp_difference/max": 0.43723440170288086, + "sampling/sampling_logp_difference/mean": 0.02771320939064026, + "step": 214, + "step_time": 80.43211394100217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 44.125, + "completions/mean_terminated_length": 44.125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.3983091115951538, + "epoch": 0.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234328508377075, + "kl": 0.017359508201479912, + "learning_rate": 4.569065797351135e-06, + "loss": 0.03, + "num_tokens": 1200200.0, + "reward": 0.07500000298023224, + "reward_std": 0.28411543369293213, + "rewards/reward_func/mean": 0.07500000298023224, + "rewards/reward_func/std": 0.3648875057697296, + "sampling/importance_sampling_ratio/max": 1.542760968208313, + "sampling/importance_sampling_ratio/mean": 0.9468032717704773, + "sampling/importance_sampling_ratio/min": 0.331230491399765, + "sampling/sampling_logp_difference/max": 0.34746503829956055, + "sampling/sampling_logp_difference/mean": 0.023643018677830696, + "step": 215, + "step_time": 68.54731415698188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 45.625, + "completions/mean_terminated_length": 45.625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.32628491520881653, + "epoch": 0.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.16749107837677, + "kl": 0.009836241602897644, + "learning_rate": 4.564510336135642e-06, + "loss": -0.1468, + "num_tokens": 1205836.0, + "reward": 0.3449999988079071, + "reward_std": 0.5644152164459229, + "rewards/reward_func/mean": 0.3449999988079071, + "rewards/reward_func/std": 0.5431390404701233, + "sampling/importance_sampling_ratio/max": 1.6908862590789795, + "sampling/importance_sampling_ratio/mean": 1.001993179321289, + "sampling/importance_sampling_ratio/min": 0.4759381413459778, + "sampling/sampling_logp_difference/max": 0.45307183265686035, + "sampling/sampling_logp_difference/mean": 0.023161139339208603, + "step": 216, + "step_time": 64.80334895499982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.40457984805107117, + "epoch": 0.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.506805181503296, + "kl": 0.02346435934305191, + "learning_rate": 4.559933219183631e-06, + "loss": 0.1214, + "num_tokens": 1211436.0, + "reward": 0.07000000774860382, + "reward_std": 0.28503167629241943, + "rewards/reward_func/mean": 0.07000000774860382, + "rewards/reward_func/std": 0.36613819003105164, + "sampling/importance_sampling_ratio/max": 2.0334134101867676, + "sampling/importance_sampling_ratio/mean": 1.1956446170806885, + "sampling/importance_sampling_ratio/min": 0.4909520149230957, + "sampling/sampling_logp_difference/max": 0.3521122932434082, + "sampling/sampling_logp_difference/mean": 0.023371540009975433, + "step": 217, + "step_time": 77.5817707440001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3373414874076843, + "epoch": 0.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.612067461013794, + "kl": 0.02040504291653633, + "learning_rate": 4.555334494506895e-06, + "loss": 0.0114, + "num_tokens": 1216591.0, + "reward": 0.3100000023841858, + "reward_std": 0.2696232199668884, + "rewards/reward_func/mean": 0.3100000023841858, + "rewards/reward_func/std": 0.525221049785614, + "sampling/importance_sampling_ratio/max": 2.0312304496765137, + "sampling/importance_sampling_ratio/mean": 1.0194265842437744, + "sampling/importance_sampling_ratio/min": 0.5188978910446167, + "sampling/sampling_logp_difference/max": 0.5666763782501221, + "sampling/sampling_logp_difference/mean": 0.023367371410131454, + "step": 218, + "step_time": 83.6308395829983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 45.125, + "completions/mean_terminated_length": 45.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3169403672218323, + "epoch": 0.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9353013038635254, + "kl": 0.058528319001197815, + "learning_rate": 4.550714210343879e-06, + "loss": 0.0259, + "num_tokens": 1222212.0, + "reward": 0.45375001430511475, + "reward_std": 0.5997226238250732, + "rewards/reward_func/mean": 0.45375001430511475, + "rewards/reward_func/std": 0.5565438866615295, + "sampling/importance_sampling_ratio/max": 1.582319736480713, + "sampling/importance_sampling_ratio/mean": 0.8256447315216064, + "sampling/importance_sampling_ratio/min": 0.30875927209854126, + "sampling/sampling_logp_difference/max": 0.9188776016235352, + "sampling/sampling_logp_difference/mean": 0.023561663925647736, + "step": 219, + "step_time": 80.09332663999521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 44.625, + "completions/mean_terminated_length": 44.625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.36092275381088257, + "epoch": 0.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6617423892021179, + "kl": 0.00997140072286129, + "learning_rate": 4.546072415159179e-06, + "loss": 0.145, + "num_tokens": 1227779.0, + "reward": 0.32749998569488525, + "reward_std": 0.578244149684906, + "rewards/reward_func/mean": 0.32749998569488525, + "rewards/reward_func/std": 0.5523391962051392, + "sampling/importance_sampling_ratio/max": 1.2815821170806885, + "sampling/importance_sampling_ratio/mean": 0.6444365978240967, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.1129623651504517, + "sampling/sampling_logp_difference/mean": 0.029664166271686554, + "step": 220, + "step_time": 67.70866433801712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 55.375, + "completions/mean_terminated_length": 55.375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.38162463903427124, + "epoch": 0.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8828568458557129, + "kl": 0.009829282760620117, + "learning_rate": 4.541409157643027e-06, + "loss": -0.0023, + "num_tokens": 1232951.0, + "reward": 0.20249998569488525, + "reward_std": 0.3496881127357483, + "rewards/reward_func/mean": 0.20249998569488525, + "rewards/reward_func/std": 0.48948225378990173, + "sampling/importance_sampling_ratio/max": 1.5951018333435059, + "sampling/importance_sampling_ratio/mean": 1.0153491497039795, + "sampling/importance_sampling_ratio/min": 0.5687949657440186, + "sampling/sampling_logp_difference/max": 0.5899345278739929, + "sampling/sampling_logp_difference/mean": 0.025079842656850815, + "step": 221, + "step_time": 59.01676659900113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.34739094972610474, + "epoch": 0.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8717793226242065, + "kl": 0.013192622922360897, + "learning_rate": 4.5367244867107905e-06, + "loss": 0.1667, + "num_tokens": 1238183.0, + "reward": 0.0612499974668026, + "reward_std": 0.28369978070259094, + "rewards/reward_func/mean": 0.0612499974668026, + "rewards/reward_func/std": 0.37635233998298645, + "sampling/importance_sampling_ratio/max": 1.736910343170166, + "sampling/importance_sampling_ratio/mean": 0.845312237739563, + "sampling/importance_sampling_ratio/min": 0.32411935925483704, + "sampling/sampling_logp_difference/max": 0.4334859848022461, + "sampling/sampling_logp_difference/mean": 0.02428375370800495, + "step": 222, + "step_time": 75.28331548400456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3791123628616333, + "epoch": 0.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7146968841552734, + "kl": 0.009486062452197075, + "learning_rate": 4.53201845150245e-06, + "loss": -0.0391, + "num_tokens": 1244209.0, + "reward": 0.1850000023841858, + "reward_std": 0.4920302927494049, + "rewards/reward_func/mean": 0.1850000023841858, + "rewards/reward_func/std": 0.45610150694847107, + "sampling/importance_sampling_ratio/max": 1.846500039100647, + "sampling/importance_sampling_ratio/mean": 1.1525156497955322, + "sampling/importance_sampling_ratio/min": 0.5845286250114441, + "sampling/sampling_logp_difference/max": 0.3337571620941162, + "sampling/sampling_logp_difference/mean": 0.023619763553142548, + "step": 223, + "step_time": 73.09225799201522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3343263268470764, + "epoch": 0.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0479278564453125, + "kl": 0.015914462506771088, + "learning_rate": 4.527291101382088e-06, + "loss": -0.0026, + "num_tokens": 1249545.0, + "reward": 0.45125001668930054, + "reward_std": 0.6346049904823303, + "rewards/reward_func/mean": 0.45125001668930054, + "rewards/reward_func/std": 0.5894412994384766, + "sampling/importance_sampling_ratio/max": 1.4512196779251099, + "sampling/importance_sampling_ratio/mean": 0.9548776149749756, + "sampling/importance_sampling_ratio/min": 0.34800758957862854, + "sampling/sampling_logp_difference/max": 0.516020655632019, + "sampling/sampling_logp_difference/mean": 0.019098889082670212, + "step": 224, + "step_time": 61.251870251988294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.35256800055503845, + "epoch": 0.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4217804670333862, + "kl": 0.021977337077260017, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4031, + "num_tokens": 1255140.0, + "reward": -0.06624999642372131, + "reward_std": 0.04499492794275284, + "rewards/reward_func/mean": -0.06624999642372131, + "rewards/reward_func/std": 0.07818248122930527, + "sampling/importance_sampling_ratio/max": 1.9071402549743652, + "sampling/importance_sampling_ratio/mean": 0.9328581094741821, + "sampling/importance_sampling_ratio/min": 0.30502116680145264, + "sampling/sampling_logp_difference/max": 0.640667200088501, + "sampling/sampling_logp_difference/mean": 0.02563471347093582, + "step": 225, + "step_time": 75.17881314299302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.35993432998657227, + "epoch": 0.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.419751524925232, + "kl": 0.011443949304521084, + "learning_rate": 4.517772654979024e-06, + "loss": -0.1155, + "num_tokens": 1261099.0, + "reward": 0.32375001907348633, + "reward_std": 0.5461503267288208, + "rewards/reward_func/mean": 0.32375001907348633, + "rewards/reward_func/std": 0.5280405282974243, + "sampling/importance_sampling_ratio/max": 1.3371970653533936, + "sampling/importance_sampling_ratio/mean": 0.980187714099884, + "sampling/importance_sampling_ratio/min": 0.6122799515724182, + "sampling/sampling_logp_difference/max": 0.3190453052520752, + "sampling/sampling_logp_difference/mean": 0.0227971188724041, + "step": 226, + "step_time": 74.47182274601073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.38980555534362793, + "epoch": 0.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9645390510559082, + "kl": 0.03210742026567459, + "learning_rate": 4.512981658540321e-06, + "loss": -0.2877, + "num_tokens": 1266504.0, + "reward": 0.32624998688697815, + "reward_std": 0.5668675899505615, + "rewards/reward_func/mean": 0.32624998688697815, + "rewards/reward_func/std": 0.5424794554710388, + "sampling/importance_sampling_ratio/max": 1.9740383625030518, + "sampling/importance_sampling_ratio/mean": 0.9537367224693298, + "sampling/importance_sampling_ratio/min": 0.35949572920799255, + "sampling/sampling_logp_difference/max": 0.7103188037872314, + "sampling/sampling_logp_difference/mean": 0.03307211026549339, + "step": 227, + "step_time": 57.815353090001736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 41.625, + "completions/mean_terminated_length": 41.625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.2987158000469208, + "epoch": 0.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.449369192123413, + "kl": 0.01764463633298874, + "learning_rate": 4.508169546876547e-06, + "loss": 0.1858, + "num_tokens": 1272180.0, + "reward": 0.19750000536441803, + "reward_std": 0.30442947149276733, + "rewards/reward_func/mean": 0.19750000536441803, + "rewards/reward_func/std": 0.4862612783908844, + "sampling/importance_sampling_ratio/max": 1.3735835552215576, + "sampling/importance_sampling_ratio/mean": 0.7681852579116821, + "sampling/importance_sampling_ratio/min": 0.3591448962688446, + "sampling/sampling_logp_difference/max": 0.4523334801197052, + "sampling/sampling_logp_difference/mean": 0.02623111382126808, + "step": 228, + "step_time": 62.75833543899353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 42.25, + "completions/mean_terminated_length": 42.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3362084925174713, + "epoch": 0.458, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9343422651290894, + "kl": 0.01823657564818859, + "learning_rate": 4.503336370464476e-06, + "loss": -0.2018, + "num_tokens": 1277910.0, + "reward": 0.09624999761581421, + "reward_std": 0.27238762378692627, + "rewards/reward_func/mean": 0.09624999761581421, + "rewards/reward_func/std": 0.3667204976081848, + "sampling/importance_sampling_ratio/max": 2.1912035942077637, + "sampling/importance_sampling_ratio/mean": 1.1063485145568848, + "sampling/importance_sampling_ratio/min": 0.4857397675514221, + "sampling/sampling_logp_difference/max": 0.8033664226531982, + "sampling/sampling_logp_difference/mean": 0.022801101207733154, + "step": 229, + "step_time": 79.13566814499791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.37015005946159363, + "epoch": 0.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9363261461257935, + "kl": 0.019413193687796593, + "learning_rate": 4.49848218000184e-06, + "loss": -0.2319, + "num_tokens": 1284156.0, + "reward": 0.19249999523162842, + "reward_std": 0.33216795325279236, + "rewards/reward_func/mean": 0.19249999523162842, + "rewards/reward_func/std": 0.4885766804218292, + "sampling/importance_sampling_ratio/max": 2.352567195892334, + "sampling/importance_sampling_ratio/mean": 1.2104213237762451, + "sampling/importance_sampling_ratio/min": 0.4038701057434082, + "sampling/sampling_logp_difference/max": 0.38236117362976074, + "sampling/sampling_logp_difference/mean": 0.028649557381868362, + "step": 230, + "step_time": 72.49902504199417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 52.0, + "completions/mean_terminated_length": 52.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3422005772590637, + "epoch": 0.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6006170511245728, + "kl": 0.025375576689839363, + "learning_rate": 4.493607026406802e-06, + "loss": 0.1486, + "num_tokens": 1289211.0, + "reward": 0.5887500047683716, + "reward_std": 0.5644031763076782, + "rewards/reward_func/mean": 0.5887500047683716, + "rewards/reward_func/std": 0.538023829460144, + "sampling/importance_sampling_ratio/max": 1.8998997211456299, + "sampling/importance_sampling_ratio/mean": 0.896106481552124, + "sampling/importance_sampling_ratio/min": 0.4045734107494354, + "sampling/sampling_logp_difference/max": 0.6570481061935425, + "sampling/sampling_logp_difference/mean": 0.02584882825613022, + "step": 231, + "step_time": 43.68991583000752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 46.125, + "completions/mean_terminated_length": 46.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3895803689956665, + "epoch": 0.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.428395390510559, + "kl": 0.036273203790187836, + "learning_rate": 4.488710960817416e-06, + "loss": -0.0911, + "num_tokens": 1294840.0, + "reward": 0.3199999928474426, + "reward_std": 0.2673959732055664, + "rewards/reward_func/mean": 0.3199999928474426, + "rewards/reward_func/std": 0.515225350856781, + "sampling/importance_sampling_ratio/max": 2.268913984298706, + "sampling/importance_sampling_ratio/mean": 1.0703096389770508, + "sampling/importance_sampling_ratio/min": 0.59864342212677, + "sampling/sampling_logp_difference/max": 0.3564453125, + "sampling/sampling_logp_difference/mean": 0.026173098012804985, + "step": 232, + "step_time": 55.16194109900971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 42.375, + "completions/mean_terminated_length": 42.375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.3868124783039093, + "epoch": 0.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3841030597686768, + "kl": 0.05138152837753296, + "learning_rate": 4.483794034591092e-06, + "loss": -0.0381, + "num_tokens": 1299943.0, + "reward": 0.20875000953674316, + "reward_std": 0.3170267939567566, + "rewards/reward_func/mean": 0.20875000953674316, + "rewards/reward_func/std": 0.47408372163772583, + "sampling/importance_sampling_ratio/max": 1.1332757472991943, + "sampling/importance_sampling_ratio/mean": 0.8772479891777039, + "sampling/importance_sampling_ratio/min": 0.6085068583488464, + "sampling/sampling_logp_difference/max": 0.6103432178497314, + "sampling/sampling_logp_difference/mean": 0.03235594183206558, + "step": 233, + "step_time": 57.82354362300248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3302072584629059, + "epoch": 0.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921053051948547, + "kl": 0.01756385527551174, + "learning_rate": 4.4788562993040615e-06, + "loss": -0.009, + "num_tokens": 1305391.0, + "reward": 0.32124999165534973, + "reward_std": 0.570686936378479, + "rewards/reward_func/mean": 0.32124999165534973, + "rewards/reward_func/std": 0.5467158555984497, + "sampling/importance_sampling_ratio/max": 1.3551892042160034, + "sampling/importance_sampling_ratio/mean": 0.8291321992874146, + "sampling/importance_sampling_ratio/min": 0.32071855664253235, + "sampling/sampling_logp_difference/max": 0.42920511960983276, + "sampling/sampling_logp_difference/mean": 0.021804213523864746, + "step": 234, + "step_time": 75.09231540199835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 41.125, + "completions/mean_terminated_length": 41.125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.38166582584381104, + "epoch": 0.47, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6319152116775513, + "kl": 0.03030114620923996, + "learning_rate": 4.473897806750829e-06, + "loss": -0.0721, + "num_tokens": 1311091.0, + "reward": 0.05250000208616257, + "reward_std": 0.29743990302085876, + "rewards/reward_func/mean": 0.05250000208616257, + "rewards/reward_func/std": 0.3858848810195923, + "sampling/importance_sampling_ratio/max": 1.8289971351623535, + "sampling/importance_sampling_ratio/mean": 0.8988316059112549, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.950005054473877, + "sampling/sampling_logp_difference/mean": 0.03348758816719055, + "step": 235, + "step_time": 76.2468694190029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 43.625, + "completions/mean_terminated_length": 43.625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.35070234537124634, + "epoch": 0.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1173913478851318, + "kl": 0.03465205430984497, + "learning_rate": 4.4689186089436365e-06, + "loss": -0.0474, + "num_tokens": 1316336.0, + "reward": 0.2150000035762787, + "reward_std": 0.3214074671268463, + "rewards/reward_func/mean": 0.2150000035762787, + "rewards/reward_func/std": 0.485386461019516, + "sampling/importance_sampling_ratio/max": 1.4250974655151367, + "sampling/importance_sampling_ratio/mean": 0.7525547742843628, + "sampling/importance_sampling_ratio/min": 0.2883736193180084, + "sampling/sampling_logp_difference/max": 0.680816650390625, + "sampling/sampling_logp_difference/mean": 0.024951238185167313, + "step": 236, + "step_time": 45.73709376499755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 43.5, + "completions/mean_terminated_length": 43.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.34809672832489014, + "epoch": 0.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1273499727249146, + "kl": 0.026207346469163895, + "learning_rate": 4.463918758111912e-06, + "loss": 0.1471, + "num_tokens": 1322121.0, + "reward": -0.03500000014901161, + "reward_std": 0.028673537075519562, + "rewards/reward_func/mean": -0.03500000014901161, + "rewards/reward_func/std": 0.03999999910593033, + "sampling/importance_sampling_ratio/max": 1.5887080430984497, + "sampling/importance_sampling_ratio/mean": 0.9801490306854248, + "sampling/importance_sampling_ratio/min": 0.5287134647369385, + "sampling/sampling_logp_difference/max": 0.49480628967285156, + "sampling/sampling_logp_difference/mean": 0.025246813893318176, + "step": 237, + "step_time": 90.44008958002087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 43.625, + "completions/mean_terminated_length": 43.625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.3520908057689667, + "epoch": 0.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0922024250030518, + "kl": 0.031959474086761475, + "learning_rate": 4.4588983067017255e-06, + "loss": 0.1783, + "num_tokens": 1328212.0, + "reward": 0.2212499976158142, + "reward_std": 0.5126502513885498, + "rewards/reward_func/mean": 0.2212499976158142, + "rewards/reward_func/std": 0.4746558964252472, + "sampling/importance_sampling_ratio/max": 1.6246837377548218, + "sampling/importance_sampling_ratio/mean": 0.8979704976081848, + "sampling/importance_sampling_ratio/min": 0.3638645112514496, + "sampling/sampling_logp_difference/max": 0.4873514175415039, + "sampling/sampling_logp_difference/mean": 0.028458524495363235, + "step": 238, + "step_time": 71.68060715598403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3871595859527588, + "epoch": 0.478, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2288389205932617, + "kl": 0.01771704852581024, + "learning_rate": 4.4538573073752365e-06, + "loss": 0.0403, + "num_tokens": 1333316.0, + "reward": 0.054999999701976776, + "reward_std": 0.29602476954460144, + "rewards/reward_func/mean": 0.054999999701976776, + "rewards/reward_func/std": 0.36570870876312256, + "sampling/importance_sampling_ratio/max": 1.5434668064117432, + "sampling/importance_sampling_ratio/mean": 0.9942148923873901, + "sampling/importance_sampling_ratio/min": 0.48395439982414246, + "sampling/sampling_logp_difference/max": 0.6915938854217529, + "sampling/sampling_logp_difference/mean": 0.035692013800144196, + "step": 239, + "step_time": 70.68868763197679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.31809288263320923, + "epoch": 0.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0677132606506348, + "kl": 0.011697100475430489, + "learning_rate": 4.448795813010142e-06, + "loss": -0.1253, + "num_tokens": 1338733.0, + "reward": 0.36125001311302185, + "reward_std": 0.5394142866134644, + "rewards/reward_func/mean": 0.36125001311302185, + "rewards/reward_func/std": 0.5180302262306213, + "sampling/importance_sampling_ratio/max": 1.8030056953430176, + "sampling/importance_sampling_ratio/mean": 1.1689889430999756, + "sampling/importance_sampling_ratio/min": 0.7900420427322388, + "sampling/sampling_logp_difference/max": 0.3128845691680908, + "sampling/sampling_logp_difference/mean": 0.021864818409085274, + "step": 240, + "step_time": 66.84894346201327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.29514139890670776, + "epoch": 0.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0881181955337524, + "kl": 0.02634306624531746, + "learning_rate": 4.443713876699124e-06, + "loss": -0.0806, + "num_tokens": 1344418.0, + "reward": 0.07999999821186066, + "reward_std": 0.25690117478370667, + "rewards/reward_func/mean": 0.07999999821186066, + "rewards/reward_func/std": 0.33342379331588745, + "sampling/importance_sampling_ratio/max": 1.363787055015564, + "sampling/importance_sampling_ratio/mean": 0.8313639163970947, + "sampling/importance_sampling_ratio/min": 0.3071029484272003, + "sampling/sampling_logp_difference/max": 0.6727430820465088, + "sampling/sampling_logp_difference/mean": 0.02636832371354103, + "step": 241, + "step_time": 81.16167590999976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 45.5, + "completions/mean_terminated_length": 45.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.36427628993988037, + "epoch": 0.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.100838541984558, + "kl": 0.041223861277103424, + "learning_rate": 4.438611551749288e-06, + "loss": -0.2757, + "num_tokens": 1350588.0, + "reward": 0.5874999761581421, + "reward_std": 0.5468531847000122, + "rewards/reward_func/mean": 0.5874999761581421, + "rewards/reward_func/std": 0.5267623662948608, + "sampling/importance_sampling_ratio/max": 2.2273752689361572, + "sampling/importance_sampling_ratio/mean": 1.0134867429733276, + "sampling/importance_sampling_ratio/min": 0.3902888894081116, + "sampling/sampling_logp_difference/max": 0.5989378690719604, + "sampling/sampling_logp_difference/mean": 0.02611200511455536, + "step": 242, + "step_time": 67.87047297498793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 44.375, + "completions/mean_terminated_length": 44.375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.35488879680633545, + "epoch": 0.486, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.331992268562317, + "kl": 0.044845033437013626, + "learning_rate": 4.4334888916816096e-06, + "loss": -0.0932, + "num_tokens": 1355918.0, + "reward": 0.20125000178813934, + "reward_std": 0.5272895097732544, + "rewards/reward_func/mean": 0.20125000178813934, + "rewards/reward_func/std": 0.4884212911128998, + "sampling/importance_sampling_ratio/max": 1.0778487920761108, + "sampling/importance_sampling_ratio/mean": 0.9069632887840271, + "sampling/importance_sampling_ratio/min": 0.3717224597930908, + "sampling/sampling_logp_difference/max": 0.729764461517334, + "sampling/sampling_logp_difference/mean": 0.026907198131084442, + "step": 243, + "step_time": 67.90137365201372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 44.125, + "completions/mean_terminated_length": 44.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3981369733810425, + "epoch": 0.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1184395551681519, + "kl": 0.015510768629610538, + "learning_rate": 4.42834595023037e-06, + "loss": -0.0087, + "num_tokens": 1360516.0, + "reward": 0.5824999809265137, + "reward_std": 0.5755907893180847, + "rewards/reward_func/mean": 0.5824999809265137, + "rewards/reward_func/std": 0.5471158027648926, + "sampling/importance_sampling_ratio/max": 1.1929256916046143, + "sampling/importance_sampling_ratio/mean": 0.7005432844161987, + "sampling/importance_sampling_ratio/min": 0.47305941581726074, + "sampling/sampling_logp_difference/max": 0.354036808013916, + "sampling/sampling_logp_difference/mean": 0.024651892483234406, + "step": 244, + "step_time": 61.913708773994585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.31958675384521484, + "epoch": 0.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1137831211090088, + "kl": 0.024688197299838066, + "learning_rate": 4.423182781342589e-06, + "loss": -0.0889, + "num_tokens": 1365727.0, + "reward": 0.32625001668930054, + "reward_std": 0.5717824697494507, + "rewards/reward_func/mean": 0.32625001668930054, + "rewards/reward_func/std": 0.5434266328811646, + "sampling/importance_sampling_ratio/max": 1.4841914176940918, + "sampling/importance_sampling_ratio/mean": 0.8029133081436157, + "sampling/importance_sampling_ratio/min": 0.39716988801956177, + "sampling/sampling_logp_difference/max": 0.5414783358573914, + "sampling/sampling_logp_difference/mean": 0.02428363636136055, + "step": 245, + "step_time": 68.99983911900199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 46.125, + "completions/mean_terminated_length": 46.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.33968907594680786, + "epoch": 0.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2323132753372192, + "kl": 0.01886364072561264, + "learning_rate": 4.417999439177465e-06, + "loss": 0.2441, + "num_tokens": 1371605.0, + "reward": 0.08750000596046448, + "reward_std": 0.2773665487766266, + "rewards/reward_func/mean": 0.08750000596046448, + "rewards/reward_func/std": 0.36577707529067993, + "sampling/importance_sampling_ratio/max": 1.7966817617416382, + "sampling/importance_sampling_ratio/mean": 0.9920728802680969, + "sampling/importance_sampling_ratio/min": 0.46240681409835815, + "sampling/sampling_logp_difference/max": 0.3581950068473816, + "sampling/sampling_logp_difference/mean": 0.02110222354531288, + "step": 246, + "step_time": 86.06116112999734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 45.0, + "completions/mean_terminated_length": 45.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3882921040058136, + "epoch": 0.494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0457669496536255, + "kl": 0.0466264933347702, + "learning_rate": 4.412795978105807e-06, + "loss": 0.0479, + "num_tokens": 1377108.0, + "reward": 0.08125000447034836, + "reward_std": 0.290319561958313, + "rewards/reward_func/mean": 0.08125000447034836, + "rewards/reward_func/std": 0.3741442859172821, + "sampling/importance_sampling_ratio/max": 1.3163291215896606, + "sampling/importance_sampling_ratio/mean": 0.8274441957473755, + "sampling/importance_sampling_ratio/min": 0.5468934178352356, + "sampling/sampling_logp_difference/max": 0.3266195058822632, + "sampling/sampling_logp_difference/mean": 0.023234577849507332, + "step": 247, + "step_time": 69.2718325239839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3644499182701111, + "epoch": 0.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8662134408950806, + "kl": 0.028522610664367676, + "learning_rate": 4.407572452709459e-06, + "loss": -0.1758, + "num_tokens": 1382458.0, + "reward": 0.3387500047683716, + "reward_std": 0.2905788719654083, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.550206184387207, + "sampling/importance_sampling_ratio/max": 1.9892619848251343, + "sampling/importance_sampling_ratio/mean": 0.994696319103241, + "sampling/importance_sampling_ratio/min": 0.32547527551651, + "sampling/sampling_logp_difference/max": 0.5737671852111816, + "sampling/sampling_logp_difference/mean": 0.029661521315574646, + "step": 248, + "step_time": 55.92289188998984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 51.375, + "completions/mean_terminated_length": 51.375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.340048223733902, + "epoch": 0.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2371996641159058, + "kl": 0.014844512566924095, + "learning_rate": 4.402328917780728e-06, + "loss": 0.1829, + "num_tokens": 1387909.0, + "reward": 0.32249999046325684, + "reward_std": 0.5570697784423828, + "rewards/reward_func/mean": 0.32249999046325684, + "rewards/reward_func/std": 0.531970739364624, + "sampling/importance_sampling_ratio/max": 1.771705985069275, + "sampling/importance_sampling_ratio/mean": 1.064762830734253, + "sampling/importance_sampling_ratio/min": 0.49545010924339294, + "sampling/sampling_logp_difference/max": 0.531287670135498, + "sampling/sampling_logp_difference/mean": 0.022021599113941193, + "step": 249, + "step_time": 72.32509338197997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 53.125, + "completions/mean_terminated_length": 53.125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3873959183692932, + "epoch": 0.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9252265691757202, + "kl": 0.01229300070554018, + "learning_rate": 4.397065428321818e-06, + "loss": 0.0921, + "num_tokens": 1393363.0, + "reward": 0.34375, + "reward_std": 0.5575473308563232, + "rewards/reward_func/mean": 0.34375, + "rewards/reward_func/std": 0.5310620069503784, + "sampling/importance_sampling_ratio/max": 1.7814934253692627, + "sampling/importance_sampling_ratio/mean": 1.0325000286102295, + "sampling/importance_sampling_ratio/min": 0.5736287832260132, + "sampling/sampling_logp_difference/max": 0.45818281173706055, + "sampling/sampling_logp_difference/mean": 0.026250842958688736, + "step": 250, + "step_time": 90.09270300000207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 54.75, + "completions/mean_terminated_length": 54.75, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3573990762233734, + "epoch": 0.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9304350018501282, + "kl": 0.01920994371175766, + "learning_rate": 4.391782039544239e-06, + "loss": 0.1493, + "num_tokens": 1399112.0, + "reward": 0.3375000059604645, + "reward_std": 0.5359517931938171, + "rewards/reward_func/mean": 0.3375000059604645, + "rewards/reward_func/std": 0.5219400525093079, + "sampling/importance_sampling_ratio/max": 1.4142625331878662, + "sampling/importance_sampling_ratio/mean": 0.8782503604888916, + "sampling/importance_sampling_ratio/min": 0.3311513364315033, + "sampling/sampling_logp_difference/max": 0.5579543113708496, + "sampling/sampling_logp_difference/mean": 0.024326374754309654, + "step": 251, + "step_time": 80.91022923600394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3160034418106079, + "epoch": 0.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0592174530029297, + "kl": 0.01498313620686531, + "learning_rate": 4.386478806868242e-06, + "loss": 0.2131, + "num_tokens": 1404423.0, + "reward": 0.19875001907348633, + "reward_std": 0.3156750202178955, + "rewards/reward_func/mean": 0.19875001907348633, + "rewards/reward_func/std": 0.4896481931209564, + "sampling/importance_sampling_ratio/max": 2.2778773307800293, + "sampling/importance_sampling_ratio/mean": 1.1893842220306396, + "sampling/importance_sampling_ratio/min": 0.4248703420162201, + "sampling/sampling_logp_difference/max": 0.31923460960388184, + "sampling/sampling_logp_difference/mean": 0.021706879138946533, + "step": 252, + "step_time": 76.23892499000067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3231382369995117, + "epoch": 0.506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1245347261428833, + "kl": 0.02181754633784294, + "learning_rate": 4.381155785922226e-06, + "loss": 0.1193, + "num_tokens": 1409836.0, + "reward": 0.30375000834465027, + "reward_std": 0.5851833820343018, + "rewards/reward_func/mean": 0.30375000834465027, + "rewards/reward_func/std": 0.56360924243927, + "sampling/importance_sampling_ratio/max": 2.5759124755859375, + "sampling/importance_sampling_ratio/mean": 1.0727500915527344, + "sampling/importance_sampling_ratio/min": 0.623710036277771, + "sampling/sampling_logp_difference/max": 0.6664900779724121, + "sampling/sampling_logp_difference/mean": 0.023033898323774338, + "step": 253, + "step_time": 63.543558523000684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3466748595237732, + "epoch": 0.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.142482042312622, + "kl": 0.0223647840321064, + "learning_rate": 4.375813032542164e-06, + "loss": -0.0771, + "num_tokens": 1415411.0, + "reward": 0.21000000834465027, + "reward_std": 0.3328624665737152, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.48594531416893005, + "sampling/importance_sampling_ratio/max": 2.0455007553100586, + "sampling/importance_sampling_ratio/mean": 1.0768548250198364, + "sampling/importance_sampling_ratio/min": 0.49030447006225586, + "sampling/sampling_logp_difference/max": 0.5383121967315674, + "sampling/sampling_logp_difference/mean": 0.03029092587530613, + "step": 254, + "step_time": 89.68648639999446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3746393322944641, + "epoch": 0.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6275901794433594, + "kl": 0.032982099801301956, + "learning_rate": 4.37045060277101e-06, + "loss": -0.3207, + "num_tokens": 1420829.0, + "reward": 0.07375000417232513, + "reward_std": 0.2716521620750427, + "rewards/reward_func/mean": 0.07375000417232513, + "rewards/reward_func/std": 0.3556457757949829, + "sampling/importance_sampling_ratio/max": 2.175576686859131, + "sampling/importance_sampling_ratio/mean": 1.019911527633667, + "sampling/importance_sampling_ratio/min": 0.40404126048088074, + "sampling/sampling_logp_difference/max": 0.3250246047973633, + "sampling/sampling_logp_difference/mean": 0.024091674014925957, + "step": 255, + "step_time": 82.5458397520124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 43.75, + "completions/mean_terminated_length": 43.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.3585067689418793, + "epoch": 0.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.829698383808136, + "kl": 0.021870668977499008, + "learning_rate": 4.365068552858116e-06, + "loss": 0.0817, + "num_tokens": 1426845.0, + "reward": 0.1887499988079071, + "reward_std": 0.5287714004516602, + "rewards/reward_func/mean": 0.1887499988079071, + "rewards/reward_func/std": 0.4895898401737213, + "sampling/importance_sampling_ratio/max": 1.7392264604568481, + "sampling/importance_sampling_ratio/mean": 0.691516637802124, + "sampling/importance_sampling_ratio/min": 0.22693133354187012, + "sampling/sampling_logp_difference/max": 0.8031024932861328, + "sampling/sampling_logp_difference/mean": 0.028423123061656952, + "step": 256, + "step_time": 77.55902498602518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3533346652984619, + "epoch": 0.514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0792738199234009, + "kl": 0.01500007789582014, + "learning_rate": 4.359666939258637e-06, + "loss": -0.1518, + "num_tokens": 1432532.0, + "reward": 0.07000000774860382, + "reward_std": 0.2898591160774231, + "rewards/reward_func/mean": 0.07000000774860382, + "rewards/reward_func/std": 0.38045087456703186, + "sampling/importance_sampling_ratio/max": 1.7748743295669556, + "sampling/importance_sampling_ratio/mean": 1.000688910484314, + "sampling/importance_sampling_ratio/min": 0.3758687973022461, + "sampling/sampling_logp_difference/max": 0.6241648197174072, + "sampling/sampling_logp_difference/mean": 0.028312578797340393, + "step": 257, + "step_time": 73.71465296699898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.36250796914100647, + "epoch": 0.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3215895891189575, + "kl": 0.020508520305156708, + "learning_rate": 4.354245818632944e-06, + "loss": -0.2258, + "num_tokens": 1438131.0, + "reward": -0.0949999988079071, + "reward_std": 0.0752500668168068, + "rewards/reward_func/mean": -0.0949999988079071, + "rewards/reward_func/std": 0.07559289783239365, + "sampling/importance_sampling_ratio/max": 2.191699743270874, + "sampling/importance_sampling_ratio/mean": 1.1482765674591064, + "sampling/importance_sampling_ratio/min": 0.5960127115249634, + "sampling/sampling_logp_difference/max": 0.35130882263183594, + "sampling/sampling_logp_difference/mean": 0.02419322356581688, + "step": 258, + "step_time": 78.94406015900313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 45.5, + "completions/mean_terminated_length": 45.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3972855806350708, + "epoch": 0.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4929661750793457, + "kl": 0.021859250962734222, + "learning_rate": 4.348805247846027e-06, + "loss": 0.1502, + "num_tokens": 1444118.0, + "reward": 0.2149999886751175, + "reward_std": 0.31619954109191895, + "rewards/reward_func/mean": 0.2149999886751175, + "rewards/reward_func/std": 0.4730146527290344, + "sampling/importance_sampling_ratio/max": 1.930320382118225, + "sampling/importance_sampling_ratio/mean": 1.0938146114349365, + "sampling/importance_sampling_ratio/min": 0.44256508350372314, + "sampling/sampling_logp_difference/max": 0.29829633235931396, + "sampling/sampling_logp_difference/mean": 0.027022160589694977, + "step": 259, + "step_time": 78.32750187598867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.35829824209213257, + "epoch": 0.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.167080044746399, + "kl": 0.013501507230103016, + "learning_rate": 4.343345283966901e-06, + "loss": -0.1633, + "num_tokens": 1449057.0, + "reward": 0.4612500071525574, + "reward_std": 0.6074321269989014, + "rewards/reward_func/mean": 0.4612500071525574, + "rewards/reward_func/std": 0.5628102421760559, + "sampling/importance_sampling_ratio/max": 1.504439115524292, + "sampling/importance_sampling_ratio/mean": 1.075119972229004, + "sampling/importance_sampling_ratio/min": 0.27312949299812317, + "sampling/sampling_logp_difference/max": 0.33127808570861816, + "sampling/sampling_logp_difference/mean": 0.026082661002874374, + "step": 260, + "step_time": 60.16271702598897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 44.25, + "completions/mean_terminated_length": 44.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3394550681114197, + "epoch": 0.522, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2084001302719116, + "kl": 0.023300688713788986, + "learning_rate": 4.337865984268002e-06, + "loss": -0.0475, + "num_tokens": 1454514.0, + "reward": 0.21000000834465027, + "reward_std": 0.5280653238296509, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.48890548944473267, + "sampling/importance_sampling_ratio/max": 1.85581374168396, + "sampling/importance_sampling_ratio/mean": 1.034727692604065, + "sampling/importance_sampling_ratio/min": 0.5134819149971008, + "sampling/sampling_logp_difference/max": 0.6527895927429199, + "sampling/sampling_logp_difference/mean": 0.028075508773326874, + "step": 261, + "step_time": 65.41974141600076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 45.5, + "completions/mean_terminated_length": 45.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.3510357737541199, + "epoch": 0.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0556254386901855, + "kl": 0.016779771074652672, + "learning_rate": 4.33236740622459e-06, + "loss": -0.177, + "num_tokens": 1460819.0, + "reward": -0.06750000268220901, + "reward_std": 0.056236058473587036, + "rewards/reward_func/mean": -0.06750000268220901, + "rewards/reward_func/std": 0.05548487976193428, + "sampling/importance_sampling_ratio/max": 2.7717020511627197, + "sampling/importance_sampling_ratio/mean": 1.2683167457580566, + "sampling/importance_sampling_ratio/min": 0.6609295010566711, + "sampling/sampling_logp_difference/max": 0.4664306640625, + "sampling/sampling_logp_difference/mean": 0.024730544537305832, + "step": 262, + "step_time": 89.55936830199789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3273940086364746, + "epoch": 0.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.300922155380249, + "kl": 0.0253826305270195, + "learning_rate": 4.326849607514149e-06, + "loss": -0.1908, + "num_tokens": 1466312.0, + "reward": 0.32750001549720764, + "reward_std": 0.5473343133926392, + "rewards/reward_func/mean": 0.32750001549720764, + "rewards/reward_func/std": 0.5286033153533936, + "sampling/importance_sampling_ratio/max": 1.702580213546753, + "sampling/importance_sampling_ratio/mean": 1.12638521194458, + "sampling/importance_sampling_ratio/min": 0.5338081121444702, + "sampling/sampling_logp_difference/max": 0.4523458480834961, + "sampling/sampling_logp_difference/mean": 0.024661045521497726, + "step": 263, + "step_time": 67.0695453399967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3104347586631775, + "epoch": 0.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8551791906356812, + "kl": 0.013168178498744965, + "learning_rate": 4.321312646015775e-06, + "loss": -0.0571, + "num_tokens": 1471010.0, + "reward": 0.3400000035762787, + "reward_std": 0.5720411539077759, + "rewards/reward_func/mean": 0.3400000035762787, + "rewards/reward_func/std": 0.5474616289138794, + "sampling/importance_sampling_ratio/max": 1.145720362663269, + "sampling/importance_sampling_ratio/mean": 0.6736248135566711, + "sampling/importance_sampling_ratio/min": 0.32681626081466675, + "sampling/sampling_logp_difference/max": 0.506934404373169, + "sampling/sampling_logp_difference/mean": 0.022311819717288017, + "step": 264, + "step_time": 69.11695815299754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.36542245745658875, + "epoch": 0.53, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7678577899932861, + "kl": 0.01349552534520626, + "learning_rate": 4.315756579809575e-06, + "loss": -0.0131, + "num_tokens": 1475783.0, + "reward": 0.45250001549720764, + "reward_std": 0.5276904106140137, + "rewards/reward_func/mean": 0.45250001549720764, + "rewards/reward_func/std": 0.5621070265769958, + "sampling/importance_sampling_ratio/max": 1.4794089794158936, + "sampling/importance_sampling_ratio/mean": 0.8411662578582764, + "sampling/importance_sampling_ratio/min": 0.2986375391483307, + "sampling/sampling_logp_difference/max": 0.3246455192565918, + "sampling/sampling_logp_difference/mean": 0.02376371994614601, + "step": 265, + "step_time": 60.26253752099001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 46.0, + "completions/mean_terminated_length": 46.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.39109036326408386, + "epoch": 0.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.601122260093689, + "kl": 0.030363351106643677, + "learning_rate": 4.3101814671760546e-06, + "loss": 0.1835, + "num_tokens": 1480977.0, + "reward": 0.19500000774860382, + "reward_std": 0.498978853225708, + "rewards/reward_func/mean": 0.19500000774860382, + "rewards/reward_func/std": 0.46309521794319153, + "sampling/importance_sampling_ratio/max": 1.9800269603729248, + "sampling/importance_sampling_ratio/mean": 1.074782133102417, + "sampling/importance_sampling_ratio/min": 0.28303632140159607, + "sampling/sampling_logp_difference/max": 0.3251028060913086, + "sampling/sampling_logp_difference/mean": 0.03300042822957039, + "step": 266, + "step_time": 73.40508937300183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.31551459431648254, + "epoch": 0.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2413638830184937, + "kl": 0.046541355550289154, + "learning_rate": 4.304587366595506e-06, + "loss": 0.0603, + "num_tokens": 1486647.0, + "reward": -0.03125, + "reward_std": 0.03729227930307388, + "rewards/reward_func/mean": -0.03125, + "rewards/reward_func/std": 0.035632047802209854, + "sampling/importance_sampling_ratio/max": 1.5068446397781372, + "sampling/importance_sampling_ratio/mean": 1.0979515314102173, + "sampling/importance_sampling_ratio/min": 0.7618433237075806, + "sampling/sampling_logp_difference/max": 0.4640469551086426, + "sampling/sampling_logp_difference/mean": 0.021073922514915466, + "step": 267, + "step_time": 78.26023236100446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 43.0, + "completions/mean_terminated_length": 43.0, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3318823575973511, + "epoch": 0.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2611191272735596, + "kl": 0.04281270503997803, + "learning_rate": 4.298974336747397e-06, + "loss": 0.1527, + "num_tokens": 1491435.0, + "reward": 0.4449999928474426, + "reward_std": 0.5528978109359741, + "rewards/reward_func/mean": 0.4449999928474426, + "rewards/reward_func/std": 0.5943784117698669, + "sampling/importance_sampling_ratio/max": 2.169663190841675, + "sampling/importance_sampling_ratio/mean": 1.2059040069580078, + "sampling/importance_sampling_ratio/min": 0.5681382417678833, + "sampling/sampling_logp_difference/max": 0.44361448287963867, + "sampling/sampling_logp_difference/mean": 0.02404342032968998, + "step": 268, + "step_time": 42.1042278399982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3449662923812866, + "epoch": 0.538, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4394900798797607, + "kl": 0.04537257179617882, + "learning_rate": 4.2933424365097565e-06, + "loss": -0.0417, + "num_tokens": 1497478.0, + "reward": 0.23375000059604645, + "reward_std": 0.3076711595058441, + "rewards/reward_func/mean": 0.23375000059604645, + "rewards/reward_func/std": 0.4731939435005188, + "sampling/importance_sampling_ratio/max": 1.6045145988464355, + "sampling/importance_sampling_ratio/mean": 0.9230844974517822, + "sampling/importance_sampling_ratio/min": 0.4231981933116913, + "sampling/sampling_logp_difference/max": 0.4870121479034424, + "sampling/sampling_logp_difference/mean": 0.027318792417645454, + "step": 269, + "step_time": 78.79464126299717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3510008454322815, + "epoch": 0.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2418956756591797, + "kl": 0.018656443804502487, + "learning_rate": 4.287691724958551e-06, + "loss": 0.0041, + "num_tokens": 1502743.0, + "reward": 0.05625000223517418, + "reward_std": 0.30281367897987366, + "rewards/reward_func/mean": 0.05625000223517418, + "rewards/reward_func/std": 0.38615089654922485, + "sampling/importance_sampling_ratio/max": 1.43633234500885, + "sampling/importance_sampling_ratio/mean": 0.9880182147026062, + "sampling/importance_sampling_ratio/min": 0.5201124548912048, + "sampling/sampling_logp_difference/max": 0.4504268169403076, + "sampling/sampling_logp_difference/mean": 0.025521527975797653, + "step": 270, + "step_time": 61.80690009400132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 46.375, + "completions/mean_terminated_length": 46.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3246690034866333, + "epoch": 0.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.149324893951416, + "kl": 0.07095597684383392, + "learning_rate": 4.282022261367074e-06, + "loss": 0.1048, + "num_tokens": 1508366.0, + "reward": 0.33375000953674316, + "reward_std": 0.5544325113296509, + "rewards/reward_func/mean": 0.33375000953674316, + "rewards/reward_func/std": 0.5303351283073425, + "sampling/importance_sampling_ratio/max": 2.206528425216675, + "sampling/importance_sampling_ratio/mean": 0.8827699422836304, + "sampling/importance_sampling_ratio/min": 0.25518810749053955, + "sampling/sampling_logp_difference/max": 1.0000684261322021, + "sampling/sampling_logp_difference/mean": 0.03170555830001831, + "step": 271, + "step_time": 75.03205436599092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.30604857206344604, + "epoch": 0.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9059810638427734, + "kl": 0.059660904109478, + "learning_rate": 4.276334105205312e-06, + "loss": -0.2118, + "num_tokens": 1513438.0, + "reward": 0.06875000149011612, + "reward_std": 0.2786497473716736, + "rewards/reward_func/mean": 0.06875000149011612, + "rewards/reward_func/std": 0.36317792534828186, + "sampling/importance_sampling_ratio/max": 2.2432162761688232, + "sampling/importance_sampling_ratio/mean": 0.9688401222229004, + "sampling/importance_sampling_ratio/min": 0.4208078682422638, + "sampling/sampling_logp_difference/max": 0.7957940101623535, + "sampling/sampling_logp_difference/mean": 0.025905363261699677, + "step": 272, + "step_time": 70.6048888520163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 46.5, + "completions/mean_terminated_length": 46.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.42150887846946716, + "epoch": 0.546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.848087191581726, + "kl": 0.04411306977272034, + "learning_rate": 4.270627316139333e-06, + "loss": 0.0721, + "num_tokens": 1519207.0, + "reward": 0.3174999952316284, + "reward_std": 0.3132410943508148, + "rewards/reward_func/mean": 0.3174999952316284, + "rewards/reward_func/std": 0.5588700175285339, + "sampling/importance_sampling_ratio/max": 2.6300036907196045, + "sampling/importance_sampling_ratio/mean": 1.3033478260040283, + "sampling/importance_sampling_ratio/min": 0.6930631399154663, + "sampling/sampling_logp_difference/max": 0.4798305034637451, + "sampling/sampling_logp_difference/mean": 0.027141718193888664, + "step": 273, + "step_time": 58.65063955899677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 45.375, + "completions/mean_terminated_length": 45.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.346945196390152, + "epoch": 0.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3174351453781128, + "kl": 0.03675675392150879, + "learning_rate": 4.264901954030655e-06, + "loss": 0.2718, + "num_tokens": 1524595.0, + "reward": 0.3349999785423279, + "reward_std": 0.5533304214477539, + "rewards/reward_func/mean": 0.3349999785423279, + "rewards/reward_func/std": 0.5276091694831848, + "sampling/importance_sampling_ratio/max": 1.823628306388855, + "sampling/importance_sampling_ratio/mean": 1.3205350637435913, + "sampling/importance_sampling_ratio/min": 0.5515703558921814, + "sampling/sampling_logp_difference/max": 0.6266647577285767, + "sampling/sampling_logp_difference/mean": 0.027004873380064964, + "step": 274, + "step_time": 77.85147952000261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 46.875, + "completions/mean_terminated_length": 46.875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.34322333335876465, + "epoch": 0.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5239591598510742, + "kl": 0.03387777507305145, + "learning_rate": 4.259158078935616e-06, + "loss": 0.0834, + "num_tokens": 1530599.0, + "reward": 0.6112500429153442, + "reward_std": 0.5479111671447754, + "rewards/reward_func/mean": 0.6112500429153442, + "rewards/reward_func/std": 0.5258309841156006, + "sampling/importance_sampling_ratio/max": 1.549354910850525, + "sampling/importance_sampling_ratio/mean": 0.9935581088066101, + "sampling/importance_sampling_ratio/min": 0.30347806215286255, + "sampling/sampling_logp_difference/max": 0.5402736663818359, + "sampling/sampling_logp_difference/mean": 0.026231858879327774, + "step": 275, + "step_time": 44.57912472402677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 44.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.35677075386047363, + "epoch": 0.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.31052827835083, + "kl": 0.03178555518388748, + "learning_rate": 4.2533957511047485e-06, + "loss": -0.2195, + "num_tokens": 1536340.0, + "reward": 0.33500000834465027, + "reward_std": 0.5740325450897217, + "rewards/reward_func/mean": 0.33500000834465027, + "rewards/reward_func/std": 0.551310122013092, + "sampling/importance_sampling_ratio/max": 1.479032039642334, + "sampling/importance_sampling_ratio/mean": 1.0965967178344727, + "sampling/importance_sampling_ratio/min": 0.65904301404953, + "sampling/sampling_logp_difference/max": 0.5930355787277222, + "sampling/sampling_logp_difference/mean": 0.027675746008753777, + "step": 276, + "step_time": 56.27619910798967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 43.75, + "completions/mean_terminated_length": 43.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3377889394760132, + "epoch": 0.554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.999190628528595, + "kl": 0.032812558114528656, + "learning_rate": 4.247615030982144e-06, + "loss": 0.0927, + "num_tokens": 1541902.0, + "reward": 0.4775000214576721, + "reward_std": 0.4942038655281067, + "rewards/reward_func/mean": 0.4775000214576721, + "rewards/reward_func/std": 0.527304470539093, + "sampling/importance_sampling_ratio/max": 1.5036600828170776, + "sampling/importance_sampling_ratio/mean": 0.9121675491333008, + "sampling/importance_sampling_ratio/min": 0.3932625651359558, + "sampling/sampling_logp_difference/max": 0.6314131021499634, + "sampling/sampling_logp_difference/mean": 0.027674881741404533, + "step": 277, + "step_time": 66.32918151500053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 44.75, + "completions/mean_terminated_length": 44.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.35427534580230713, + "epoch": 0.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1933726072311401, + "kl": 0.06276769191026688, + "learning_rate": 4.241815979204822e-06, + "loss": 0.1654, + "num_tokens": 1548032.0, + "reward": 0.19750000536441803, + "reward_std": 0.32018035650253296, + "rewards/reward_func/mean": 0.19750000536441803, + "rewards/reward_func/std": 0.48417091369628906, + "sampling/importance_sampling_ratio/max": 1.7151527404785156, + "sampling/importance_sampling_ratio/mean": 1.0098499059677124, + "sampling/importance_sampling_ratio/min": 0.11691775172948837, + "sampling/sampling_logp_difference/max": 1.2129077911376953, + "sampling/sampling_logp_difference/mean": 0.02612270414829254, + "step": 278, + "step_time": 88.54080397897633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 42.25, + "completions/mean_terminated_length": 42.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3449207544326782, + "epoch": 0.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.924571692943573, + "kl": 0.040953975170850754, + "learning_rate": 4.235998656602091e-06, + "loss": 0.1445, + "num_tokens": 1553989.0, + "reward": -0.06750000268220901, + "reward_std": 0.03040887415409088, + "rewards/reward_func/mean": -0.06750000268220901, + "rewards/reward_func/std": 0.051199886947870255, + "sampling/importance_sampling_ratio/max": 0.9909558892250061, + "sampling/importance_sampling_ratio/mean": 0.7079716920852661, + "sampling/importance_sampling_ratio/min": 0.47882190346717834, + "sampling/sampling_logp_difference/max": 0.6735103130340576, + "sampling/sampling_logp_difference/mean": 0.03071964532136917, + "step": 279, + "step_time": 82.45593494997593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 46.5, + "completions/mean_terminated_length": 46.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.38967427611351013, + "epoch": 0.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2218159437179565, + "kl": 0.027102379128336906, + "learning_rate": 4.230163124194913e-06, + "loss": 0.0018, + "num_tokens": 1559709.0, + "reward": 0.08249999582767487, + "reward_std": 0.27100443840026855, + "rewards/reward_func/mean": 0.08249999582767487, + "rewards/reward_func/std": 0.3715507984161377, + "sampling/importance_sampling_ratio/max": 1.6555224657058716, + "sampling/importance_sampling_ratio/mean": 1.070943832397461, + "sampling/importance_sampling_ratio/min": 0.5340960025787354, + "sampling/sampling_logp_difference/max": 0.4693126082420349, + "sampling/sampling_logp_difference/mean": 0.027980361133813858, + "step": 280, + "step_time": 72.15682938401005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.29565370082855225, + "epoch": 0.562, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.19646418094635, + "kl": 0.03536316379904747, + "learning_rate": 4.224309443195261e-06, + "loss": -0.2295, + "num_tokens": 1565505.0, + "reward": 0.48250001668930054, + "reward_std": 0.5834507346153259, + "rewards/reward_func/mean": 0.48250001668930054, + "rewards/reward_func/std": 0.5402050018310547, + "sampling/importance_sampling_ratio/max": 1.867733359336853, + "sampling/importance_sampling_ratio/mean": 1.0055427551269531, + "sampling/importance_sampling_ratio/min": 0.4048961102962494, + "sampling/sampling_logp_difference/max": 0.6614785194396973, + "sampling/sampling_logp_difference/mean": 0.027097908779978752, + "step": 281, + "step_time": 51.57847329697688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 56.125, + "completions/mean_terminated_length": 56.125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.35139578580856323, + "epoch": 0.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.086079478263855, + "kl": 0.022778205573558807, + "learning_rate": 4.218437675005479e-06, + "loss": -0.2493, + "num_tokens": 1571510.0, + "reward": 0.5787500143051147, + "reward_std": 0.5751041173934937, + "rewards/reward_func/mean": 0.5787500143051147, + "rewards/reward_func/std": 0.5546797513961792, + "sampling/importance_sampling_ratio/max": 1.7726209163665771, + "sampling/importance_sampling_ratio/mean": 1.1253960132598877, + "sampling/importance_sampling_ratio/min": 0.4784102737903595, + "sampling/sampling_logp_difference/max": 0.4526965618133545, + "sampling/sampling_logp_difference/mean": 0.02535804733633995, + "step": 282, + "step_time": 64.53625944399391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.398438036441803, + "epoch": 0.566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.212346076965332, + "kl": 0.037971317768096924, + "learning_rate": 4.212547881217637e-06, + "loss": -0.064, + "num_tokens": 1577608.0, + "reward": 0.2212499976158142, + "reward_std": 0.29377394914627075, + "rewards/reward_func/mean": 0.2212499976158142, + "rewards/reward_func/std": 0.4595475196838379, + "sampling/importance_sampling_ratio/max": 1.3747642040252686, + "sampling/importance_sampling_ratio/mean": 0.9095951914787292, + "sampling/importance_sampling_ratio/min": 0.45244070887565613, + "sampling/sampling_logp_difference/max": 0.7049179077148438, + "sampling/sampling_logp_difference/mean": 0.030841922387480736, + "step": 283, + "step_time": 84.87170067700208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.36814165115356445, + "epoch": 0.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1691949367523193, + "kl": 0.036575593054294586, + "learning_rate": 4.206640123612885e-06, + "loss": 0.0775, + "num_tokens": 1582975.0, + "reward": 0.7237499952316284, + "reward_std": 0.5162468552589417, + "rewards/reward_func/mean": 0.7237499952316284, + "rewards/reward_func/std": 0.4783584177494049, + "sampling/importance_sampling_ratio/max": 1.1772783994674683, + "sampling/importance_sampling_ratio/mean": 0.7350926399230957, + "sampling/importance_sampling_ratio/min": 0.24923691153526306, + "sampling/sampling_logp_difference/max": 0.8092962503433228, + "sampling/sampling_logp_difference/mean": 0.03558259829878807, + "step": 284, + "step_time": 59.08952103398042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 40.375, + "completions/mean_terminated_length": 40.375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.31116950511932373, + "epoch": 0.57, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1499625444412231, + "kl": 0.05957644805312157, + "learning_rate": 4.2007144641608035e-06, + "loss": 0.2114, + "num_tokens": 1588426.0, + "reward": 0.20625001192092896, + "reward_std": 0.29973241686820984, + "rewards/reward_func/mean": 0.20625001192092896, + "rewards/reward_func/std": 0.4607428014278412, + "sampling/importance_sampling_ratio/max": 1.3515949249267578, + "sampling/importance_sampling_ratio/mean": 0.9128226041793823, + "sampling/importance_sampling_ratio/min": 0.24216710031032562, + "sampling/sampling_logp_difference/max": 0.3609771728515625, + "sampling/sampling_logp_difference/mean": 0.025414273142814636, + "step": 285, + "step_time": 69.37833641498582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 41.875, + "completions/mean_terminated_length": 41.875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.40447139739990234, + "epoch": 0.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3697779178619385, + "kl": 0.02398294396698475, + "learning_rate": 4.194770965018758e-06, + "loss": 0.2327, + "num_tokens": 1594255.0, + "reward": 0.4737499952316284, + "reward_std": 0.5166642665863037, + "rewards/reward_func/mean": 0.4737499952316284, + "rewards/reward_func/std": 0.5499594211578369, + "sampling/importance_sampling_ratio/max": 1.7312536239624023, + "sampling/importance_sampling_ratio/mean": 0.9678086638450623, + "sampling/importance_sampling_ratio/min": 0.5032089352607727, + "sampling/sampling_logp_difference/max": 0.43535709381103516, + "sampling/sampling_logp_difference/mean": 0.030212290585041046, + "step": 286, + "step_time": 86.86279435199685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 53.75, + "completions/mean_terminated_length": 53.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.35356995463371277, + "epoch": 0.574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1606370210647583, + "kl": 0.023810427635908127, + "learning_rate": 4.188809688531241e-06, + "loss": -0.1423, + "num_tokens": 1599919.0, + "reward": 0.08499999344348907, + "reward_std": 0.28996607661247253, + "rewards/reward_func/mean": 0.08499999344348907, + "rewards/reward_func/std": 0.3730185925960541, + "sampling/importance_sampling_ratio/max": 1.213178277015686, + "sampling/importance_sampling_ratio/mean": 0.8015030026435852, + "sampling/importance_sampling_ratio/min": 0.18578791618347168, + "sampling/sampling_logp_difference/max": 0.367124080657959, + "sampling/sampling_logp_difference/mean": 0.02456764504313469, + "step": 287, + "step_time": 86.99813774897484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 42.5, + "completions/mean_terminated_length": 42.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.37545299530029297, + "epoch": 0.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9484260678291321, + "kl": 0.03164836764335632, + "learning_rate": 4.182830697229223e-06, + "loss": 0.0409, + "num_tokens": 1605747.0, + "reward": 0.22625000774860382, + "reward_std": 0.3063211739063263, + "rewards/reward_func/mean": 0.22625000774860382, + "rewards/reward_func/std": 0.4631241261959076, + "sampling/importance_sampling_ratio/max": 1.3765383958816528, + "sampling/importance_sampling_ratio/mean": 0.9212698340415955, + "sampling/importance_sampling_ratio/min": 0.42979246377944946, + "sampling/sampling_logp_difference/max": 0.46885204315185547, + "sampling/sampling_logp_difference/mean": 0.0255972221493721, + "step": 288, + "step_time": 67.68871720400057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3726646602153778, + "epoch": 0.578, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2278378009796143, + "kl": 0.03291169926524162, + "learning_rate": 4.176834053829492e-06, + "loss": 0.0774, + "num_tokens": 1611005.0, + "reward": 0.33249998092651367, + "reward_std": 0.5452134609222412, + "rewards/reward_func/mean": 0.33249998092651367, + "rewards/reward_func/std": 0.5284410715103149, + "sampling/importance_sampling_ratio/max": 1.2047574520111084, + "sampling/importance_sampling_ratio/mean": 0.9297256469726562, + "sampling/importance_sampling_ratio/min": 0.5736981630325317, + "sampling/sampling_logp_difference/max": 0.4195805788040161, + "sampling/sampling_logp_difference/mean": 0.025271501392126083, + "step": 289, + "step_time": 57.347708321001846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.368205189704895, + "epoch": 0.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.06058931350708, + "kl": 0.01727902702987194, + "learning_rate": 4.170819821234001e-06, + "loss": -0.1661, + "num_tokens": 1616685.0, + "reward": 0.6000000238418579, + "reward_std": 0.5506213903427124, + "rewards/reward_func/mean": 0.6000000238418579, + "rewards/reward_func/std": 0.5333184599876404, + "sampling/importance_sampling_ratio/max": 1.8462737798690796, + "sampling/importance_sampling_ratio/mean": 0.8948688507080078, + "sampling/importance_sampling_ratio/min": 0.4318339228630066, + "sampling/sampling_logp_difference/max": 0.5361829996109009, + "sampling/sampling_logp_difference/mean": 0.030598482117056847, + "step": 290, + "step_time": 63.116088277020026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 45.875, + "completions/mean_terminated_length": 45.875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3375159502029419, + "epoch": 0.582, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7271595001220703, + "kl": 0.036022864282131195, + "learning_rate": 4.164788062529203e-06, + "loss": 0.3231, + "num_tokens": 1622232.0, + "reward": 0.3474999964237213, + "reward_std": 0.2686045467853546, + "rewards/reward_func/mean": 0.3474999964237213, + "rewards/reward_func/std": 0.5296832323074341, + "sampling/importance_sampling_ratio/max": 2.602046489715576, + "sampling/importance_sampling_ratio/mean": 1.1395049095153809, + "sampling/importance_sampling_ratio/min": 0.46707242727279663, + "sampling/sampling_logp_difference/max": 0.3359344005584717, + "sampling/sampling_logp_difference/mean": 0.025325238704681396, + "step": 291, + "step_time": 80.50347790899104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 45.375, + "completions/mean_terminated_length": 45.375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.36788409948349, + "epoch": 0.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0641146898269653, + "kl": 0.027490928769111633, + "learning_rate": 4.158738840985393e-06, + "loss": 0.1119, + "num_tokens": 1627699.0, + "reward": 0.17999999225139618, + "reward_std": 0.3215157687664032, + "rewards/reward_func/mean": 0.17999999225139618, + "rewards/reward_func/std": 0.4744320213794708, + "sampling/importance_sampling_ratio/max": 1.5140283107757568, + "sampling/importance_sampling_ratio/mean": 0.6938580870628357, + "sampling/importance_sampling_ratio/min": 0.20368647575378418, + "sampling/sampling_logp_difference/max": 0.8070402145385742, + "sampling/sampling_logp_difference/mean": 0.02949894405901432, + "step": 292, + "step_time": 78.2215075980057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 45.125, + "completions/mean_terminated_length": 45.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.4428096115589142, + "epoch": 0.586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.749643325805664, + "kl": 0.04018591344356537, + "learning_rate": 4.1526722200560445e-06, + "loss": -0.1564, + "num_tokens": 1633177.0, + "reward": 0.34375, + "reward_std": 0.5563769340515137, + "rewards/reward_func/mean": 0.34375, + "rewards/reward_func/std": 0.5327540040016174, + "sampling/importance_sampling_ratio/max": 2.5977683067321777, + "sampling/importance_sampling_ratio/mean": 0.9267335534095764, + "sampling/importance_sampling_ratio/min": 0.43353283405303955, + "sampling/sampling_logp_difference/max": 0.6082069873809814, + "sampling/sampling_logp_difference/mean": 0.036718130111694336, + "step": 293, + "step_time": 72.21064010998816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3257609009742737, + "epoch": 0.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1807507276535034, + "kl": 0.030308792367577553, + "learning_rate": 4.146588263377137e-06, + "loss": 0.0547, + "num_tokens": 1638629.0, + "reward": 0.5962499976158142, + "reward_std": 0.5583738088607788, + "rewards/reward_func/mean": 0.5962499976158142, + "rewards/reward_func/std": 0.5385679006576538, + "sampling/importance_sampling_ratio/max": 1.4324092864990234, + "sampling/importance_sampling_ratio/mean": 0.9339421391487122, + "sampling/importance_sampling_ratio/min": 0.6571045517921448, + "sampling/sampling_logp_difference/max": 0.35495901107788086, + "sampling/sampling_logp_difference/mean": 0.021137617528438568, + "step": 294, + "step_time": 68.63880399399204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 51.375, + "completions/mean_terminated_length": 51.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.36071956157684326, + "epoch": 0.59, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4050084352493286, + "kl": 0.03706521913409233, + "learning_rate": 4.140487034766499e-06, + "loss": 0.0768, + "num_tokens": 1644795.0, + "reward": 0.35249999165534973, + "reward_std": 0.5524863600730896, + "rewards/reward_func/mean": 0.35249999165534973, + "rewards/reward_func/std": 0.5344623327255249, + "sampling/importance_sampling_ratio/max": 1.5176059007644653, + "sampling/importance_sampling_ratio/mean": 0.9870838522911072, + "sampling/importance_sampling_ratio/min": 0.6246324777603149, + "sampling/sampling_logp_difference/max": 0.8912210464477539, + "sampling/sampling_logp_difference/mean": 0.028878837823867798, + "step": 295, + "step_time": 57.42247140299878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3431392312049866, + "epoch": 0.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1508780717849731, + "kl": 0.03153597190976143, + "learning_rate": 4.134368598223132e-06, + "loss": 0.1312, + "num_tokens": 1650107.0, + "reward": 0.3387500047683716, + "reward_std": 0.5518091320991516, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.5335979461669922, + "sampling/importance_sampling_ratio/max": 1.6481400728225708, + "sampling/importance_sampling_ratio/mean": 0.934108555316925, + "sampling/importance_sampling_ratio/min": 0.31286314129829407, + "sampling/sampling_logp_difference/max": 0.3653430938720703, + "sampling/sampling_logp_difference/mean": 0.03261955454945564, + "step": 296, + "step_time": 48.63130289298715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 42.125, + "completions/mean_terminated_length": 42.125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.33156031370162964, + "epoch": 0.594, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4071409702301025, + "kl": 0.041910890489816666, + "learning_rate": 4.128233017926538e-06, + "loss": 0.2664, + "num_tokens": 1655436.0, + "reward": -0.0637499988079071, + "reward_std": 0.03450929373502731, + "rewards/reward_func/mean": -0.0637499988079071, + "rewards/reward_func/std": 0.04274091124534607, + "sampling/importance_sampling_ratio/max": 1.8012773990631104, + "sampling/importance_sampling_ratio/mean": 0.9478522539138794, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.553492546081543, + "sampling/sampling_logp_difference/mean": 0.030245978385210037, + "step": 297, + "step_time": 74.5723572280258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 43.875, + "completions/mean_terminated_length": 43.875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3426669239997864, + "epoch": 0.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1105406284332275, + "kl": 0.043890222907066345, + "learning_rate": 4.1220803582360545e-06, + "loss": -0.0848, + "num_tokens": 1661032.0, + "reward": -0.03999999910593033, + "reward_std": 0.03082464262843132, + "rewards/reward_func/mean": -0.03999999910593033, + "rewards/reward_func/std": 0.029760954901576042, + "sampling/importance_sampling_ratio/max": 1.2387455701828003, + "sampling/importance_sampling_ratio/mean": 0.9630196690559387, + "sampling/importance_sampling_ratio/min": 0.6839993596076965, + "sampling/sampling_logp_difference/max": 0.5896548628807068, + "sampling/sampling_logp_difference/mean": 0.029966674745082855, + "step": 298, + "step_time": 70.879077177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.36084866523742676, + "epoch": 0.598, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1439580917358398, + "kl": 0.04164861887693405, + "learning_rate": 4.115910683690167e-06, + "loss": 0.109, + "num_tokens": 1666142.0, + "reward": 0.46000000834465027, + "reward_std": 0.5153526067733765, + "rewards/reward_func/mean": 0.46000000834465027, + "rewards/reward_func/std": 0.5593363046646118, + "sampling/importance_sampling_ratio/max": 1.1726311445236206, + "sampling/importance_sampling_ratio/mean": 0.7731176614761353, + "sampling/importance_sampling_ratio/min": 0.44161850214004517, + "sampling/sampling_logp_difference/max": 0.5879793167114258, + "sampling/sampling_logp_difference/mean": 0.029971588402986526, + "step": 299, + "step_time": 40.97617705501034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.35997629165649414, + "epoch": 0.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2015734910964966, + "kl": 0.051748767495155334, + "learning_rate": 4.109724059005844e-06, + "loss": -0.1698, + "num_tokens": 1671675.0, + "reward": 0.19499999284744263, + "reward_std": 0.5328658819198608, + "rewards/reward_func/mean": 0.19499999284744263, + "rewards/reward_func/std": 0.4941948652267456, + "sampling/importance_sampling_ratio/max": 1.5676287412643433, + "sampling/importance_sampling_ratio/mean": 0.8457791805267334, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.410290241241455, + "sampling/sampling_logp_difference/mean": 0.03373635932803154, + "step": 300, + "step_time": 71.877353650023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3280709981918335, + "epoch": 0.602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5422002077102661, + "kl": 0.04158281534910202, + "learning_rate": 4.1035205490778505e-06, + "loss": -0.1959, + "num_tokens": 1677448.0, + "reward": 0.3199999928474426, + "reward_std": 0.5629400610923767, + "rewards/reward_func/mean": 0.3199999928474426, + "rewards/reward_func/std": 0.5402909517288208, + "sampling/importance_sampling_ratio/max": 1.9516421556472778, + "sampling/importance_sampling_ratio/mean": 1.1000713109970093, + "sampling/importance_sampling_ratio/min": 0.3914698660373688, + "sampling/sampling_logp_difference/max": 0.4937098026275635, + "sampling/sampling_logp_difference/mean": 0.025912173092365265, + "step": 301, + "step_time": 59.86676025000634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 43.5, + "completions/mean_terminated_length": 43.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3313376307487488, + "epoch": 0.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.997847855091095, + "kl": 0.04069218039512634, + "learning_rate": 4.09730021897807e-06, + "loss": -0.0619, + "num_tokens": 1683406.0, + "reward": 0.19749999046325684, + "reward_std": 0.3082555830478668, + "rewards/reward_func/mean": 0.19749999046325684, + "rewards/reward_func/std": 0.4607369899749756, + "sampling/importance_sampling_ratio/max": 1.2228721380233765, + "sampling/importance_sampling_ratio/mean": 0.8224000930786133, + "sampling/importance_sampling_ratio/min": 0.42023351788520813, + "sampling/sampling_logp_difference/max": 0.5434841513633728, + "sampling/sampling_logp_difference/mean": 0.02660995163023472, + "step": 302, + "step_time": 57.52045150997583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.31421804428100586, + "epoch": 0.606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4279636144638062, + "kl": 0.07083047926425934, + "learning_rate": 4.091063133954821e-06, + "loss": 0.2061, + "num_tokens": 1689378.0, + "reward": 0.19249999523162842, + "reward_std": 0.5418117642402649, + "rewards/reward_func/mean": 0.19249999523162842, + "rewards/reward_func/std": 0.5016757845878601, + "sampling/importance_sampling_ratio/max": 2.132955312728882, + "sampling/importance_sampling_ratio/mean": 1.1564010381698608, + "sampling/importance_sampling_ratio/min": 0.4834826588630676, + "sampling/sampling_logp_difference/max": 0.5907609462738037, + "sampling/sampling_logp_difference/mean": 0.03361092135310173, + "step": 303, + "step_time": 67.22909496401553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 42.0, + "completions/mean_terminated_length": 42.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3458764851093292, + "epoch": 0.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9183087348937988, + "kl": 0.04568685591220856, + "learning_rate": 4.084809359432175e-06, + "loss": -0.0881, + "num_tokens": 1694776.0, + "reward": 0.09000000357627869, + "reward_std": 0.2616836130619049, + "rewards/reward_func/mean": 0.09000000357627869, + "rewards/reward_func/std": 0.36847177147865295, + "sampling/importance_sampling_ratio/max": 1.7175835371017456, + "sampling/importance_sampling_ratio/mean": 0.9860607385635376, + "sampling/importance_sampling_ratio/min": 0.327860951423645, + "sampling/sampling_logp_difference/max": 0.5655612945556641, + "sampling/sampling_logp_difference/mean": 0.028499091044068336, + "step": 304, + "step_time": 72.93971385998884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 46.5, + "completions/mean_terminated_length": 46.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.331033319234848, + "epoch": 0.61, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8779069185256958, + "kl": 0.04410824924707413, + "learning_rate": 4.0785389610092684e-06, + "loss": 0.0479, + "num_tokens": 1700586.0, + "reward": 0.59375, + "reward_std": 0.2722131311893463, + "rewards/reward_func/mean": 0.59375, + "rewards/reward_func/std": 0.5475644469261169, + "sampling/importance_sampling_ratio/max": 1.7427366971969604, + "sampling/importance_sampling_ratio/mean": 1.1339176893234253, + "sampling/importance_sampling_ratio/min": 0.7268555760383606, + "sampling/sampling_logp_difference/max": 0.3699074983596802, + "sampling/sampling_logp_difference/mean": 0.02840990573167801, + "step": 305, + "step_time": 38.05158003201359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 44.75, + "completions/mean_terminated_length": 44.75, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3475850224494934, + "epoch": 0.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3732694387435913, + "kl": 0.05851783603429794, + "learning_rate": 4.072252004459612e-06, + "loss": -0.4087, + "num_tokens": 1706255.0, + "reward": 0.4462500214576721, + "reward_std": 0.5143392086029053, + "rewards/reward_func/mean": 0.4462500214576721, + "rewards/reward_func/std": 0.5704118609428406, + "sampling/importance_sampling_ratio/max": 2.909179925918579, + "sampling/importance_sampling_ratio/mean": 1.355375051498413, + "sampling/importance_sampling_ratio/min": 0.4884859621524811, + "sampling/sampling_logp_difference/max": 0.7288825511932373, + "sampling/sampling_logp_difference/mean": 0.031946711242198944, + "step": 306, + "step_time": 56.35153491600067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 44.375, + "completions/mean_terminated_length": 44.375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.37247800827026367, + "epoch": 0.614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.049514889717102, + "kl": 0.05375540256500244, + "learning_rate": 4.065948555730405e-06, + "loss": 0.1078, + "num_tokens": 1712211.0, + "reward": 0.45625001192092896, + "reward_std": 0.6163418889045715, + "rewards/reward_func/mean": 0.45625001192092896, + "rewards/reward_func/std": 0.5708875060081482, + "sampling/importance_sampling_ratio/max": 1.4518539905548096, + "sampling/importance_sampling_ratio/mean": 0.7474272847175598, + "sampling/importance_sampling_ratio/min": 0.3841031789779663, + "sampling/sampling_logp_difference/max": 0.5305154323577881, + "sampling/sampling_logp_difference/mean": 0.030646320432424545, + "step": 307, + "step_time": 66.77364812800079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3100343346595764, + "epoch": 0.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0289682149887085, + "kl": 0.052720747888088226, + "learning_rate": 4.059628680941843e-06, + "loss": 0.0498, + "num_tokens": 1717818.0, + "reward": 0.21125000715255737, + "reward_std": 0.29770827293395996, + "rewards/reward_func/mean": 0.21125000715255737, + "rewards/reward_func/std": 0.4665508270263672, + "sampling/importance_sampling_ratio/max": 1.2022721767425537, + "sampling/importance_sampling_ratio/mean": 0.9475799202919006, + "sampling/importance_sampling_ratio/min": 0.5206012725830078, + "sampling/sampling_logp_difference/max": 0.5640921592712402, + "sampling/sampling_logp_difference/mean": 0.03175481781363487, + "step": 308, + "step_time": 52.89879798798938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 44.625, + "completions/mean_terminated_length": 44.625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.392910361289978, + "epoch": 0.618, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1463991403579712, + "kl": 0.09175321459770203, + "learning_rate": 4.053292446386422e-06, + "loss": 0.103, + "num_tokens": 1722948.0, + "reward": 0.32750001549720764, + "reward_std": 0.5502669811248779, + "rewards/reward_func/mean": 0.32750001549720764, + "rewards/reward_func/std": 0.5303031802177429, + "sampling/importance_sampling_ratio/max": 1.6144081354141235, + "sampling/importance_sampling_ratio/mean": 0.8773033022880554, + "sampling/importance_sampling_ratio/min": 0.24481238424777985, + "sampling/sampling_logp_difference/max": 0.6958191394805908, + "sampling/sampling_logp_difference/mean": 0.03132324665784836, + "step": 309, + "step_time": 62.81438364399946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 43.375, + "completions/mean_terminated_length": 43.375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3314189910888672, + "epoch": 0.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1365076303482056, + "kl": 0.045356642454862595, + "learning_rate": 4.046939918528243e-06, + "loss": -0.0211, + "num_tokens": 1728875.0, + "reward": -0.04874999821186066, + "reward_std": 0.03686491772532463, + "rewards/reward_func/mean": -0.04874999821186066, + "rewards/reward_func/std": 0.03482097014784813, + "sampling/importance_sampling_ratio/max": 1.382016897201538, + "sampling/importance_sampling_ratio/mean": 0.8258918523788452, + "sampling/importance_sampling_ratio/min": 0.43768084049224854, + "sampling/sampling_logp_difference/max": 0.34904003143310547, + "sampling/sampling_logp_difference/mean": 0.02661317214369774, + "step": 310, + "step_time": 84.43736876899493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 42.0, + "completions/mean_terminated_length": 42.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.342917263507843, + "epoch": 0.622, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4713886976242065, + "kl": 0.060660719871520996, + "learning_rate": 4.040571164002319e-06, + "loss": 0.0434, + "num_tokens": 1734842.0, + "reward": 0.3512499928474426, + "reward_std": 0.5479995012283325, + "rewards/reward_func/mean": 0.3512499928474426, + "rewards/reward_func/std": 0.5240620970726013, + "sampling/importance_sampling_ratio/max": 1.7727338075637817, + "sampling/importance_sampling_ratio/mean": 0.9686833024024963, + "sampling/importance_sampling_ratio/min": 0.39146628975868225, + "sampling/sampling_logp_difference/max": 0.700446605682373, + "sampling/sampling_logp_difference/mean": 0.029514621943235397, + "step": 311, + "step_time": 68.79045250298805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 41.5, + "completions/mean_terminated_length": 41.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3210405707359314, + "epoch": 0.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8161787390708923, + "kl": 0.04533413052558899, + "learning_rate": 4.034186249613869e-06, + "loss": 0.137, + "num_tokens": 1740368.0, + "reward": 0.0637500062584877, + "reward_std": 0.2749331593513489, + "rewards/reward_func/mean": 0.0637500062584877, + "rewards/reward_func/std": 0.3796215355396271, + "sampling/importance_sampling_ratio/max": 1.1369949579238892, + "sampling/importance_sampling_ratio/mean": 0.693924069404602, + "sampling/importance_sampling_ratio/min": 0.3688696622848511, + "sampling/sampling_logp_difference/max": 0.5726242065429688, + "sampling/sampling_logp_difference/mean": 0.02921966463327408, + "step": 312, + "step_time": 74.01787159900414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 40.625, + "completions/mean_terminated_length": 40.625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.3051733076572418, + "epoch": 0.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9667705297470093, + "kl": 0.04298641160130501, + "learning_rate": 4.027785242337626e-06, + "loss": 0.0425, + "num_tokens": 1745737.0, + "reward": 0.4637500047683716, + "reward_std": 0.6106890439987183, + "rewards/reward_func/mean": 0.4637500047683716, + "rewards/reward_func/std": 0.5657343864440918, + "sampling/importance_sampling_ratio/max": 1.2044254541397095, + "sampling/importance_sampling_ratio/mean": 0.7017180919647217, + "sampling/importance_sampling_ratio/min": 0.41121870279312134, + "sampling/sampling_logp_difference/max": 0.5601418018341064, + "sampling/sampling_logp_difference/mean": 0.03372935950756073, + "step": 313, + "step_time": 56.49958878697362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.37881386280059814, + "epoch": 0.628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9144023060798645, + "kl": 0.03883805125951767, + "learning_rate": 4.021368209317126e-06, + "loss": 0.0706, + "num_tokens": 1750627.0, + "reward": 0.3125, + "reward_std": 0.5669680833816528, + "rewards/reward_func/mean": 0.3125, + "rewards/reward_func/std": 0.5413936972618103, + "sampling/importance_sampling_ratio/max": 1.3021681308746338, + "sampling/importance_sampling_ratio/mean": 0.6122154593467712, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.7377749681472778, + "sampling/sampling_logp_difference/mean": 0.03731653094291687, + "step": 314, + "step_time": 64.65534779199515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 44.375, + "completions/mean_terminated_length": 44.375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.322407066822052, + "epoch": 0.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6095478534698486, + "kl": 0.06635308265686035, + "learning_rate": 4.014935217864009e-06, + "loss": 0.0829, + "num_tokens": 1756143.0, + "reward": 0.3400000333786011, + "reward_std": 0.5607088804244995, + "rewards/reward_func/mean": 0.3400000333786011, + "rewards/reward_func/std": 0.5389142632484436, + "sampling/importance_sampling_ratio/max": 2.9857375621795654, + "sampling/importance_sampling_ratio/mean": 1.097962737083435, + "sampling/importance_sampling_ratio/min": 0.29625648260116577, + "sampling/sampling_logp_difference/max": 0.9051809310913086, + "sampling/sampling_logp_difference/mean": 0.030315592885017395, + "step": 315, + "step_time": 75.3316736620036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.354464590549469, + "epoch": 0.632, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7500587701797485, + "kl": 0.041008904576301575, + "learning_rate": 4.008486335457312e-06, + "loss": 0.2378, + "num_tokens": 1761628.0, + "reward": 0.0949999988079071, + "reward_std": 0.28850340843200684, + "rewards/reward_func/mean": 0.0949999988079071, + "rewards/reward_func/std": 0.3677732050418854, + "sampling/importance_sampling_ratio/max": 1.8479022979736328, + "sampling/importance_sampling_ratio/mean": 0.8248315453529358, + "sampling/importance_sampling_ratio/min": 0.3391701281070709, + "sampling/sampling_logp_difference/max": 0.9850505590438843, + "sampling/sampling_logp_difference/mean": 0.026787061244249344, + "step": 316, + "step_time": 72.44107799098128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 46.625, + "completions/mean_terminated_length": 46.625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.37994247674942017, + "epoch": 0.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.623810052871704, + "kl": 0.03418804332613945, + "learning_rate": 4.002021629742759e-06, + "loss": -0.0948, + "num_tokens": 1767506.0, + "reward": 0.07000000029802322, + "reward_std": 0.2821284532546997, + "rewards/reward_func/mean": 0.07000000029802322, + "rewards/reward_func/std": 0.37405118346214294, + "sampling/importance_sampling_ratio/max": 2.5057566165924072, + "sampling/importance_sampling_ratio/mean": 1.1749823093414307, + "sampling/importance_sampling_ratio/min": 0.4858405590057373, + "sampling/sampling_logp_difference/max": 0.3556022644042969, + "sampling/sampling_logp_difference/mean": 0.03094809502363205, + "step": 317, + "step_time": 73.08309103900683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 43.125, + "completions/mean_terminated_length": 43.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3214326500892639, + "epoch": 0.636, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.192352294921875, + "kl": 0.05013597011566162, + "learning_rate": 3.995541168532055e-06, + "loss": 0.1681, + "num_tokens": 1772800.0, + "reward": 0.21500001847743988, + "reward_std": 0.30573850870132446, + "rewards/reward_func/mean": 0.21500001847743988, + "rewards/reward_func/std": 0.47563493251800537, + "sampling/importance_sampling_ratio/max": 2.769392967224121, + "sampling/importance_sampling_ratio/mean": 1.9062137603759766, + "sampling/importance_sampling_ratio/min": 0.9690021276473999, + "sampling/sampling_logp_difference/max": 0.4255542755126953, + "sampling/sampling_logp_difference/mean": 0.029431238770484924, + "step": 318, + "step_time": 78.88887128600618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 46.125, + "completions/mean_terminated_length": 46.125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3768948018550873, + "epoch": 0.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7573028802871704, + "kl": 0.041640929877758026, + "learning_rate": 3.989045019802171e-06, + "loss": -0.0145, + "num_tokens": 1778980.0, + "reward": 0.19999998807907104, + "reward_std": 0.31466037034988403, + "rewards/reward_func/mean": 0.19999998807907104, + "rewards/reward_func/std": 0.46757736802101135, + "sampling/importance_sampling_ratio/max": 2.0911169052124023, + "sampling/importance_sampling_ratio/mean": 1.2725701332092285, + "sampling/importance_sampling_ratio/min": 0.7229686379432678, + "sampling/sampling_logp_difference/max": 0.35713261365890503, + "sampling/sampling_logp_difference/mean": 0.030360868200659752, + "step": 319, + "step_time": 77.96896261701477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.36589163541793823, + "epoch": 0.64, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2818424701690674, + "kl": 0.03474745154380798, + "learning_rate": 3.982533251694632e-06, + "loss": -0.3233, + "num_tokens": 1785246.0, + "reward": 0.21000000834465027, + "reward_std": 0.308533251285553, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.4801190495491028, + "sampling/importance_sampling_ratio/max": 1.9405113458633423, + "sampling/importance_sampling_ratio/mean": 1.1001548767089844, + "sampling/importance_sampling_ratio/min": 0.4418053925037384, + "sampling/sampling_logp_difference/max": 0.6586148738861084, + "sampling/sampling_logp_difference/mean": 0.02688867226243019, + "step": 320, + "step_time": 76.73345412599156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.29908668994903564, + "epoch": 0.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0722907781600952, + "kl": 0.05384838581085205, + "learning_rate": 3.976005932514807e-06, + "loss": -0.0787, + "num_tokens": 1790214.0, + "reward": 0.45875000953674316, + "reward_std": 0.5092880129814148, + "rewards/reward_func/mean": 0.45875000953674316, + "rewards/reward_func/std": 0.5402231812477112, + "sampling/importance_sampling_ratio/max": 2.0527751445770264, + "sampling/importance_sampling_ratio/mean": 1.0192339420318604, + "sampling/importance_sampling_ratio/min": 0.3669769763946533, + "sampling/sampling_logp_difference/max": 0.6636786460876465, + "sampling/sampling_logp_difference/mean": 0.02793467789888382, + "step": 321, + "step_time": 52.20804076900822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 49.875, + "completions/mean_terminated_length": 49.875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.35425591468811035, + "epoch": 0.644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0075650215148926, + "kl": 0.07112079858779907, + "learning_rate": 3.969463130731183e-06, + "loss": -0.2281, + "num_tokens": 1796411.0, + "reward": 0.48625001311302185, + "reward_std": 0.5877156257629395, + "rewards/reward_func/mean": 0.48625001311302185, + "rewards/reward_func/std": 0.5441622138023376, + "sampling/importance_sampling_ratio/max": 2.1321513652801514, + "sampling/importance_sampling_ratio/mean": 0.815255343914032, + "sampling/importance_sampling_ratio/min": 0.3492589294910431, + "sampling/sampling_logp_difference/max": 0.6331918239593506, + "sampling/sampling_logp_difference/mean": 0.02603982575237751, + "step": 322, + "step_time": 75.52292313199723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 44.125, + "completions/mean_terminated_length": 44.125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.31836074590682983, + "epoch": 0.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4238131046295166, + "kl": 0.046384476125240326, + "learning_rate": 3.962904914974656e-06, + "loss": 0.0372, + "num_tokens": 1801901.0, + "reward": 0.35374999046325684, + "reward_std": 0.5490298271179199, + "rewards/reward_func/mean": 0.35374999046325684, + "rewards/reward_func/std": 0.5245934128761292, + "sampling/importance_sampling_ratio/max": 1.4118305444717407, + "sampling/importance_sampling_ratio/mean": 0.8808070421218872, + "sampling/importance_sampling_ratio/min": 0.5332664847373962, + "sampling/sampling_logp_difference/max": 0.8431804180145264, + "sampling/sampling_logp_difference/mean": 0.028243277221918106, + "step": 323, + "step_time": 68.1251233840012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 43.25, + "completions/mean_terminated_length": 43.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.31607919931411743, + "epoch": 0.648, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3807119131088257, + "kl": 0.07309075444936752, + "learning_rate": 3.956331354037805e-06, + "loss": -0.0479, + "num_tokens": 1806905.0, + "reward": 0.21124999225139618, + "reward_std": 0.3054782450199127, + "rewards/reward_func/mean": 0.21124999225139618, + "rewards/reward_func/std": 0.4696028232574463, + "sampling/importance_sampling_ratio/max": 1.787608027458191, + "sampling/importance_sampling_ratio/mean": 1.0451674461364746, + "sampling/importance_sampling_ratio/min": 0.46881166100502014, + "sampling/sampling_logp_difference/max": 0.5253305435180664, + "sampling/sampling_logp_difference/mean": 0.029314052313566208, + "step": 324, + "step_time": 59.86747224899591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3600061535835266, + "epoch": 0.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.817853569984436, + "kl": 0.09967118501663208, + "learning_rate": 3.949742516874175e-06, + "loss": 0.2608, + "num_tokens": 1812735.0, + "reward": 0.20499999821186066, + "reward_std": 0.3137704133987427, + "rewards/reward_func/mean": 0.20499999821186066, + "rewards/reward_func/std": 0.47461265325546265, + "sampling/importance_sampling_ratio/max": 2.5692901611328125, + "sampling/importance_sampling_ratio/mean": 0.9470885992050171, + "sampling/importance_sampling_ratio/min": 0.3106057047843933, + "sampling/sampling_logp_difference/max": 0.8942482471466064, + "sampling/sampling_logp_difference/mean": 0.03815475106239319, + "step": 325, + "step_time": 61.724708480003756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 41.125, + "completions/mean_terminated_length": 41.125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3302639126777649, + "epoch": 0.652, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5889121294021606, + "kl": 0.032158948481082916, + "learning_rate": 3.943138472597549e-06, + "loss": 0.0395, + "num_tokens": 1817852.0, + "reward": 0.08499999344348907, + "reward_std": 0.28810441493988037, + "rewards/reward_func/mean": 0.08499999344348907, + "rewards/reward_func/std": 0.37232860922813416, + "sampling/importance_sampling_ratio/max": 2.248680830001831, + "sampling/importance_sampling_ratio/mean": 1.1071228981018066, + "sampling/importance_sampling_ratio/min": 0.4287269413471222, + "sampling/sampling_logp_difference/max": 0.4500095844268799, + "sampling/sampling_logp_difference/mean": 0.03271816670894623, + "step": 326, + "step_time": 78.85871165001299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 41.875, + "completions/mean_terminated_length": 41.875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.3419029712677002, + "epoch": 0.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6311590671539307, + "kl": 0.047403544187545776, + "learning_rate": 3.936519290481226e-06, + "loss": -0.2247, + "num_tokens": 1823582.0, + "reward": 0.19874998927116394, + "reward_std": 0.518446683883667, + "rewards/reward_func/mean": 0.19874998927116394, + "rewards/reward_func/std": 0.48034030199050903, + "sampling/importance_sampling_ratio/max": 1.9648873805999756, + "sampling/importance_sampling_ratio/mean": 1.09955632686615, + "sampling/importance_sampling_ratio/min": 0.5106386542320251, + "sampling/sampling_logp_difference/max": 0.47838133573532104, + "sampling/sampling_logp_difference/mean": 0.03344731032848358, + "step": 327, + "step_time": 74.20873203998781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.31387221813201904, + "epoch": 0.656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6050386428833008, + "kl": 0.0334724560379982, + "learning_rate": 3.929885039957296e-06, + "loss": 0.1015, + "num_tokens": 1828698.0, + "reward": 0.1899999976158142, + "reward_std": 0.33716854453086853, + "rewards/reward_func/mean": 0.1899999976158142, + "rewards/reward_func/std": 0.4895770847797394, + "sampling/importance_sampling_ratio/max": 1.870469570159912, + "sampling/importance_sampling_ratio/mean": 0.8102731108665466, + "sampling/importance_sampling_ratio/min": 0.3841648995876312, + "sampling/sampling_logp_difference/max": 0.6972520351409912, + "sampling/sampling_logp_difference/mean": 0.02928170934319496, + "step": 328, + "step_time": 132.74387937001302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 50.625, + "completions/mean_terminated_length": 50.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.33447951078414917, + "epoch": 0.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.379056453704834, + "kl": 0.03984800726175308, + "learning_rate": 3.923235790615907e-06, + "loss": -0.1686, + "num_tokens": 1834063.0, + "reward": 0.21125000715255737, + "reward_std": 0.506218671798706, + "rewards/reward_func/mean": 0.21125000715255737, + "rewards/reward_func/std": 0.46896353363990784, + "sampling/importance_sampling_ratio/max": 1.307706594467163, + "sampling/importance_sampling_ratio/mean": 0.8228154182434082, + "sampling/importance_sampling_ratio/min": 0.5563095211982727, + "sampling/sampling_logp_difference/max": 0.5035196542739868, + "sampling/sampling_logp_difference/mean": 0.025627177208662033, + "step": 329, + "step_time": 143.56957943798625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 39.125, + "completions/mean_terminated_length": 39.125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.3598230183124542, + "epoch": 0.66, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.37947416305542, + "kl": 0.0858326181769371, + "learning_rate": 3.916571612204538e-06, + "loss": -0.1299, + "num_tokens": 1839339.0, + "reward": 0.21250000596046448, + "reward_std": 0.5239397287368774, + "rewards/reward_func/mean": 0.21250000596046448, + "rewards/reward_func/std": 0.4850846827030182, + "sampling/importance_sampling_ratio/max": 1.8522554636001587, + "sampling/importance_sampling_ratio/mean": 1.0172133445739746, + "sampling/importance_sampling_ratio/min": 0.35823509097099304, + "sampling/sampling_logp_difference/max": 0.7279484272003174, + "sampling/sampling_logp_difference/mean": 0.03425194323062897, + "step": 330, + "step_time": 138.2842517439858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 42.0, + "completions/mean_terminated_length": 42.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3195509910583496, + "epoch": 0.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0245273113250732, + "kl": 0.04566916078329086, + "learning_rate": 3.909892574627267e-06, + "loss": -0.0532, + "num_tokens": 1845149.0, + "reward": 0.3187499940395355, + "reward_std": 0.5860832333564758, + "rewards/reward_func/mean": 0.3187499940395355, + "rewards/reward_func/std": 0.56430584192276, + "sampling/importance_sampling_ratio/max": 2.2832202911376953, + "sampling/importance_sampling_ratio/mean": 1.1496918201446533, + "sampling/importance_sampling_ratio/min": 0.5304498672485352, + "sampling/sampling_logp_difference/max": 0.8081755638122559, + "sampling/sampling_logp_difference/mean": 0.028967654332518578, + "step": 331, + "step_time": 137.66897978598718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.32289984822273254, + "epoch": 0.664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8185482025146484, + "kl": 0.10041403025388718, + "learning_rate": 3.903198747944037e-06, + "loss": 0.1168, + "num_tokens": 1850899.0, + "reward": 0.22624999284744263, + "reward_std": 0.3100869655609131, + "rewards/reward_func/mean": 0.22624999284744263, + "rewards/reward_func/std": 0.47853758931159973, + "sampling/importance_sampling_ratio/max": 1.338444471359253, + "sampling/importance_sampling_ratio/mean": 0.8065738677978516, + "sampling/importance_sampling_ratio/min": 0.3798188865184784, + "sampling/sampling_logp_difference/max": 0.8146078586578369, + "sampling/sampling_logp_difference/mean": 0.02689986675977707, + "step": 332, + "step_time": 131.41008436502307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 46.375, + "completions/mean_terminated_length": 46.375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3707420527935028, + "epoch": 0.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2363697290420532, + "kl": 0.03508942574262619, + "learning_rate": 3.896490202369924e-06, + "loss": 0.0616, + "num_tokens": 1856034.0, + "reward": 0.32124999165534973, + "reward_std": 0.5887609720230103, + "rewards/reward_func/mean": 0.32124999165534973, + "rewards/reward_func/std": 0.563393771648407, + "sampling/importance_sampling_ratio/max": 1.6524691581726074, + "sampling/importance_sampling_ratio/mean": 0.8412412405014038, + "sampling/importance_sampling_ratio/min": 0.23496931791305542, + "sampling/sampling_logp_difference/max": 0.6445038318634033, + "sampling/sampling_logp_difference/mean": 0.03339887410402298, + "step": 333, + "step_time": 111.69739279698115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 53.75, + "completions/mean_terminated_length": 53.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.33553531765937805, + "epoch": 0.668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9219703674316406, + "kl": 0.09176594018936157, + "learning_rate": 3.889767008274396e-06, + "loss": 0.2604, + "num_tokens": 1861621.0, + "reward": 0.3537500202655792, + "reward_std": 0.5339703559875488, + "rewards/reward_func/mean": 0.3537500202655792, + "rewards/reward_func/std": 0.5136407017707825, + "sampling/importance_sampling_ratio/max": 1.8493578433990479, + "sampling/importance_sampling_ratio/mean": 1.008284091949463, + "sampling/importance_sampling_ratio/min": 0.41284075379371643, + "sampling/sampling_logp_difference/max": 0.6564333438873291, + "sampling/sampling_logp_difference/mean": 0.025348259136080742, + "step": 334, + "step_time": 143.32291307201376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 39.625, + "completions/mean_terminated_length": 39.625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.3513152301311493, + "epoch": 0.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9763464331626892, + "kl": 0.06704329699277878, + "learning_rate": 3.883029236180577e-06, + "loss": -0.1172, + "num_tokens": 1867778.0, + "reward": 0.35374999046325684, + "reward_std": 0.2704784870147705, + "rewards/reward_func/mean": 0.35374999046325684, + "rewards/reward_func/std": 0.5355620980262756, + "sampling/importance_sampling_ratio/max": 1.107627272605896, + "sampling/importance_sampling_ratio/mean": 0.7351757287979126, + "sampling/importance_sampling_ratio/min": 0.19063004851341248, + "sampling/sampling_logp_difference/max": 0.8456048965454102, + "sampling/sampling_logp_difference/mean": 0.033929385244846344, + "step": 335, + "step_time": 103.19867493197671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 41.875, + "completions/mean_terminated_length": 41.875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.31681621074676514, + "epoch": 0.672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7861030101776123, + "kl": 0.03690392151474953, + "learning_rate": 3.876276956764509e-06, + "loss": 0.2589, + "num_tokens": 1872931.0, + "reward": 0.21375000476837158, + "reward_std": 0.32227829098701477, + "rewards/reward_func/mean": 0.21375000476837158, + "rewards/reward_func/std": 0.4870299994945526, + "sampling/importance_sampling_ratio/max": 2.2250986099243164, + "sampling/importance_sampling_ratio/mean": 1.0775128602981567, + "sampling/importance_sampling_ratio/min": 0.5108433961868286, + "sampling/sampling_logp_difference/max": 0.35452377796173096, + "sampling/sampling_logp_difference/mean": 0.026147497817873955, + "step": 336, + "step_time": 108.95734459199593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 41.5, + "completions/mean_terminated_length": 41.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.3189052939414978, + "epoch": 0.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9657090902328491, + "kl": 0.06692003458738327, + "learning_rate": 3.869510240854408e-06, + "loss": 0.1239, + "num_tokens": 1878410.0, + "reward": 0.32249999046325684, + "reward_std": 0.5750815868377686, + "rewards/reward_func/mean": 0.32249999046325684, + "rewards/reward_func/std": 0.5570265650749207, + "sampling/importance_sampling_ratio/max": 2.461669445037842, + "sampling/importance_sampling_ratio/mean": 1.2536345720291138, + "sampling/importance_sampling_ratio/min": 0.7026631236076355, + "sampling/sampling_logp_difference/max": 0.595012903213501, + "sampling/sampling_logp_difference/mean": 0.025707338005304337, + "step": 337, + "step_time": 101.68271847401047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3161134421825409, + "epoch": 0.676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2771250009536743, + "kl": 0.017098795622587204, + "learning_rate": 3.862729159429921e-06, + "loss": -0.2422, + "num_tokens": 1883892.0, + "reward": 0.7250000238418579, + "reward_std": 0.49705883860588074, + "rewards/reward_func/mean": 0.7250000238418579, + "rewards/reward_func/std": 0.4603104889392853, + "sampling/importance_sampling_ratio/max": 2.6572251319885254, + "sampling/importance_sampling_ratio/mean": 1.1089489459991455, + "sampling/importance_sampling_ratio/min": 0.5457375645637512, + "sampling/sampling_logp_difference/max": 0.491180419921875, + "sampling/sampling_logp_difference/mean": 0.02168424054980278, + "step": 338, + "step_time": 114.0666631339991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 41.0, + "completions/mean_terminated_length": 41.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.396328866481781, + "epoch": 0.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4366711378097534, + "kl": 0.17721709609031677, + "learning_rate": 3.855933783621384e-06, + "loss": 0.1282, + "num_tokens": 1889200.0, + "reward": 0.30000001192092896, + "reward_std": 0.30565518140792847, + "rewards/reward_func/mean": 0.30000001192092896, + "rewards/reward_func/std": 0.5595406293869019, + "sampling/importance_sampling_ratio/max": 1.7435904741287231, + "sampling/importance_sampling_ratio/mean": 0.9623221158981323, + "sampling/importance_sampling_ratio/min": 0.3996666669845581, + "sampling/sampling_logp_difference/max": 0.7043921947479248, + "sampling/sampling_logp_difference/mean": 0.03383718058466911, + "step": 339, + "step_time": 126.46415012000944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 44.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3627287745475769, + "epoch": 0.68, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1164697408676147, + "kl": 0.04694604501128197, + "learning_rate": 3.849124184709073e-06, + "loss": 0.0417, + "num_tokens": 1894511.0, + "reward": 0.06749999523162842, + "reward_std": 0.2736448645591736, + "rewards/reward_func/mean": 0.06749999523162842, + "rewards/reward_func/std": 0.3591557443141937, + "sampling/importance_sampling_ratio/max": 1.6633166074752808, + "sampling/importance_sampling_ratio/mean": 1.0712683200836182, + "sampling/importance_sampling_ratio/min": 0.5509455800056458, + "sampling/sampling_logp_difference/max": 0.3140767812728882, + "sampling/sampling_logp_difference/mean": 0.02482220157980919, + "step": 340, + "step_time": 162.8545896350115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 42.75, + "completions/mean_terminated_length": 42.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.31078046560287476, + "epoch": 0.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7845726013183594, + "kl": 0.04270056635141373, + "learning_rate": 3.84230043412246e-06, + "loss": -0.2732, + "num_tokens": 1900006.0, + "reward": 0.32625001668930054, + "reward_std": 0.5653538703918457, + "rewards/reward_func/mean": 0.32625001668930054, + "rewards/reward_func/std": 0.5494916439056396, + "sampling/importance_sampling_ratio/max": 2.5016562938690186, + "sampling/importance_sampling_ratio/mean": 1.0579283237457275, + "sampling/importance_sampling_ratio/min": 0.6514557003974915, + "sampling/sampling_logp_difference/max": 0.5734856128692627, + "sampling/sampling_logp_difference/mean": 0.029540089890360832, + "step": 341, + "step_time": 155.76240094099194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 43.125, + "completions/mean_terminated_length": 43.125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.3381982445716858, + "epoch": 0.684, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8940988779067993, + "kl": 0.1022232174873352, + "learning_rate": 3.835462603439458e-06, + "loss": 0.1884, + "num_tokens": 1904962.0, + "reward": 0.21625001728534698, + "reward_std": 0.31474921107292175, + "rewards/reward_func/mean": 0.21625001728534698, + "rewards/reward_func/std": 0.4785973131656647, + "sampling/importance_sampling_ratio/max": 1.3884916305541992, + "sampling/importance_sampling_ratio/mean": 1.0323078632354736, + "sampling/importance_sampling_ratio/min": 0.5798347592353821, + "sampling/sampling_logp_difference/max": 0.6027919054031372, + "sampling/sampling_logp_difference/mean": 0.026107758283615112, + "step": 342, + "step_time": 166.86433797102654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 40.625, + "completions/mean_terminated_length": 40.625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.33497440814971924, + "epoch": 0.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1610031127929688, + "kl": 0.09758682548999786, + "learning_rate": 3.828610764385676e-06, + "loss": -0.0412, + "num_tokens": 1911022.0, + "reward": -0.07250000536441803, + "reward_std": 0.054318200796842575, + "rewards/reward_func/mean": -0.07250000536441803, + "rewards/reward_func/std": 0.054967526346445084, + "sampling/importance_sampling_ratio/max": 1.8711848258972168, + "sampling/importance_sampling_ratio/mean": 1.0269144773483276, + "sampling/importance_sampling_ratio/min": 0.13635054230690002, + "sampling/sampling_logp_difference/max": 1.1250584125518799, + "sampling/sampling_logp_difference/mean": 0.0336228646337986, + "step": 343, + "step_time": 180.27627392599243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 43.5, + "completions/mean_terminated_length": 43.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.2781206965446472, + "epoch": 0.688, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1632080078125, + "kl": 0.10593483597040176, + "learning_rate": 3.821744988833664e-06, + "loss": 0.0054, + "num_tokens": 1916625.0, + "reward": 0.3199999928474426, + "reward_std": 0.5272395610809326, + "rewards/reward_func/mean": 0.3199999928474426, + "rewards/reward_func/std": 0.5035871863365173, + "sampling/importance_sampling_ratio/max": 1.4332565069198608, + "sampling/importance_sampling_ratio/mean": 0.9319165945053101, + "sampling/importance_sampling_ratio/min": 0.38523051142692566, + "sampling/sampling_logp_difference/max": 0.8005368709564209, + "sampling/sampling_logp_difference/mean": 0.020923875272274017, + "step": 344, + "step_time": 169.41366141600884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 39.375, + "completions/mean_terminated_length": 39.375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.2810716927051544, + "epoch": 0.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4414160251617432, + "kl": 0.06682641804218292, + "learning_rate": 3.814865348802157e-06, + "loss": -0.2297, + "num_tokens": 1921399.0, + "reward": 0.21000000834465027, + "reward_std": 0.5288327932357788, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.48986876010894775, + "sampling/importance_sampling_ratio/max": 2.1541748046875, + "sampling/importance_sampling_ratio/mean": 1.0254625082015991, + "sampling/importance_sampling_ratio/min": 0.4324725568294525, + "sampling/sampling_logp_difference/max": 0.8203954696655273, + "sampling/sampling_logp_difference/mean": 0.026356138288974762, + "step": 345, + "step_time": 129.16487764098565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 42.5, + "completions/mean_terminated_length": 42.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.34982192516326904, + "epoch": 0.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2038586139678955, + "kl": 0.16085557639598846, + "learning_rate": 3.807971916455325e-06, + "loss": 0.0815, + "num_tokens": 1926202.0, + "reward": 0.07124999910593033, + "reward_std": 0.27993497252464294, + "rewards/reward_func/mean": 0.07124999910593033, + "rewards/reward_func/std": 0.3617590665817261, + "sampling/importance_sampling_ratio/max": 1.7573891878128052, + "sampling/importance_sampling_ratio/mean": 0.9032962322235107, + "sampling/importance_sampling_ratio/min": 0.29005834460258484, + "sampling/sampling_logp_difference/max": 1.3162736892700195, + "sampling/sampling_logp_difference/mean": 0.03286924958229065, + "step": 346, + "step_time": 146.0007931359869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3232361674308777, + "epoch": 0.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0941061973571777, + "kl": 0.026414429768919945, + "learning_rate": 3.8010647641020116e-06, + "loss": 0.1266, + "num_tokens": 1931733.0, + "reward": 0.5987499952316284, + "reward_std": 0.5347613096237183, + "rewards/reward_func/mean": 0.5987499952316284, + "rewards/reward_func/std": 0.5155701041221619, + "sampling/importance_sampling_ratio/max": 1.3389661312103271, + "sampling/importance_sampling_ratio/mean": 0.6809048056602478, + "sampling/importance_sampling_ratio/min": 0.19245320558547974, + "sampling/sampling_logp_difference/max": 1.0144225358963013, + "sampling/sampling_logp_difference/mean": 0.02918568253517151, + "step": 347, + "step_time": 71.32271579199005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 43.625, + "completions/mean_terminated_length": 43.625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3079647123813629, + "epoch": 0.696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.154685139656067, + "kl": 0.09130249172449112, + "learning_rate": 3.794143964194976e-06, + "loss": -0.0868, + "num_tokens": 1936951.0, + "reward": 0.46250003576278687, + "reward_std": 0.5227590799331665, + "rewards/reward_func/mean": 0.46250003576278687, + "rewards/reward_func/std": 0.5515885949134827, + "sampling/importance_sampling_ratio/max": 1.4307719469070435, + "sampling/importance_sampling_ratio/mean": 0.8543438911437988, + "sampling/importance_sampling_ratio/min": 0.3845389783382416, + "sampling/sampling_logp_difference/max": 0.6325764656066895, + "sampling/sampling_logp_difference/mean": 0.03074759989976883, + "step": 348, + "step_time": 61.10349588000099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.32811909914016724, + "epoch": 0.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4756181240081787, + "kl": 0.07114064693450928, + "learning_rate": 3.7872095893301344e-06, + "loss": 0.232, + "num_tokens": 1942770.0, + "reward": 0.3100000023841858, + "reward_std": 0.31517019867897034, + "rewards/reward_func/mean": 0.3100000023841858, + "rewards/reward_func/std": 0.5469656586647034, + "sampling/importance_sampling_ratio/max": 1.6810601949691772, + "sampling/importance_sampling_ratio/mean": 0.9875794649124146, + "sampling/importance_sampling_ratio/min": 0.3065728545188904, + "sampling/sampling_logp_difference/max": 0.4949173927307129, + "sampling/sampling_logp_difference/mean": 0.025241130962967873, + "step": 349, + "step_time": 48.66686181901605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 42.25, + "completions/mean_terminated_length": 42.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.3392306864261627, + "epoch": 0.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.145804524421692, + "kl": 0.03867422789335251, + "learning_rate": 3.7802617122457976e-06, + "loss": 0.0707, + "num_tokens": 1948625.0, + "reward": 0.09000000357627869, + "reward_std": 0.2660285234451294, + "rewards/reward_func/mean": 0.09000000357627869, + "rewards/reward_func/std": 0.3568112850189209, + "sampling/importance_sampling_ratio/max": 1.6579951047897339, + "sampling/importance_sampling_ratio/mean": 1.0459256172180176, + "sampling/importance_sampling_ratio/min": 0.5915707945823669, + "sampling/sampling_logp_difference/max": 0.4812997579574585, + "sampling/sampling_logp_difference/mean": 0.02722543105483055, + "step": 350, + "step_time": 76.36607002699748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3211192488670349, + "epoch": 0.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0469977855682373, + "kl": 0.027719926089048386, + "learning_rate": 3.773300405821908e-06, + "loss": -0.0345, + "num_tokens": 1954448.0, + "reward": 0.32375001907348633, + "reward_std": 0.281686395406723, + "rewards/reward_func/mean": 0.32375001907348633, + "rewards/reward_func/std": 0.527769923210144, + "sampling/importance_sampling_ratio/max": 1.9723066091537476, + "sampling/importance_sampling_ratio/mean": 1.1106541156768799, + "sampling/importance_sampling_ratio/min": 0.5282062292098999, + "sampling/sampling_logp_difference/max": 0.3540763854980469, + "sampling/sampling_logp_difference/mean": 0.02676708996295929, + "step": 351, + "step_time": 78.26852857100312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3046337366104126, + "epoch": 0.704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.208288550376892, + "kl": 0.03900061175227165, + "learning_rate": 3.766325743079277e-06, + "loss": -0.1125, + "num_tokens": 1959253.0, + "reward": 0.48375001549720764, + "reward_std": 0.5962894558906555, + "rewards/reward_func/mean": 0.48375001549720764, + "rewards/reward_func/std": 0.5521371364593506, + "sampling/importance_sampling_ratio/max": 1.670057773590088, + "sampling/importance_sampling_ratio/mean": 0.9210529327392578, + "sampling/importance_sampling_ratio/min": 0.5158092379570007, + "sampling/sampling_logp_difference/max": 0.432373046875, + "sampling/sampling_logp_difference/mean": 0.026169460266828537, + "step": 352, + "step_time": 46.398978653975064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 41.0, + "completions/mean_terminated_length": 41.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.31318768858909607, + "epoch": 0.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.743395209312439, + "kl": 0.0346795991063118, + "learning_rate": 3.7593377971788162e-06, + "loss": 0.1768, + "num_tokens": 1964058.0, + "reward": 0.22500000894069672, + "reward_std": 0.31266871094703674, + "rewards/reward_func/mean": 0.22500000894069672, + "rewards/reward_func/std": 0.4766250550746918, + "sampling/importance_sampling_ratio/max": 1.8619552850723267, + "sampling/importance_sampling_ratio/mean": 1.091329574584961, + "sampling/importance_sampling_ratio/min": 0.28429219126701355, + "sampling/sampling_logp_difference/max": 0.7750775814056396, + "sampling/sampling_logp_difference/mean": 0.031623724848032, + "step": 353, + "step_time": 58.123137532005785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.35126036405563354, + "epoch": 0.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4767982959747314, + "kl": 0.033217888325452805, + "learning_rate": 3.752336641420772e-06, + "loss": 0.0776, + "num_tokens": 1968954.0, + "reward": 0.058750007301568985, + "reward_std": 0.2922004461288452, + "rewards/reward_func/mean": 0.058750007301568985, + "rewards/reward_func/std": 0.3824521601200104, + "sampling/importance_sampling_ratio/max": 1.434964895248413, + "sampling/importance_sampling_ratio/mean": 1.084316372871399, + "sampling/importance_sampling_ratio/min": 0.5846006870269775, + "sampling/sampling_logp_difference/max": 0.4179229736328125, + "sampling/sampling_logp_difference/mean": 0.0240701362490654, + "step": 354, + "step_time": 59.292683080013376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 41.5, + "completions/mean_terminated_length": 41.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.328327476978302, + "epoch": 0.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.247985601425171, + "kl": 0.08559633791446686, + "learning_rate": 3.7453223492439544e-06, + "loss": 0.0737, + "num_tokens": 1975108.0, + "reward": 0.4699999988079071, + "reward_std": 0.5927736163139343, + "rewards/reward_func/mean": 0.4699999988079071, + "rewards/reward_func/std": 0.5489730834960938, + "sampling/importance_sampling_ratio/max": 1.4081599712371826, + "sampling/importance_sampling_ratio/mean": 0.8387018442153931, + "sampling/importance_sampling_ratio/min": 0.19741253554821014, + "sampling/sampling_logp_difference/max": 0.9286923408508301, + "sampling/sampling_logp_difference/mean": 0.03092752769589424, + "step": 355, + "step_time": 75.66988031598157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.28474941849708557, + "epoch": 0.712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.736219882965088, + "kl": 0.04625285789370537, + "learning_rate": 3.7382949942249695e-06, + "loss": 0.2333, + "num_tokens": 1980329.0, + "reward": 0.3212500214576721, + "reward_std": 0.5408031344413757, + "rewards/reward_func/mean": 0.3212500214576721, + "rewards/reward_func/std": 0.5195723176002502, + "sampling/importance_sampling_ratio/max": 2.1159582138061523, + "sampling/importance_sampling_ratio/mean": 1.136248230934143, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.6177792549133301, + "sampling/sampling_logp_difference/mean": 0.02496938779950142, + "step": 356, + "step_time": 65.68752502801362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.32189249992370605, + "epoch": 0.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1041979789733887, + "kl": 0.060013748705387115, + "learning_rate": 3.731254650077446e-06, + "loss": -0.0802, + "num_tokens": 1985708.0, + "reward": 0.44624999165534973, + "reward_std": 0.6114711165428162, + "rewards/reward_func/mean": 0.44624999165534973, + "rewards/reward_func/std": 0.5663905143737793, + "sampling/importance_sampling_ratio/max": 1.5184648036956787, + "sampling/importance_sampling_ratio/mean": 0.783623218536377, + "sampling/importance_sampling_ratio/min": 0.3887534737586975, + "sampling/sampling_logp_difference/max": 0.5213687419891357, + "sampling/sampling_logp_difference/mean": 0.025201398879289627, + "step": 357, + "step_time": 61.06893451101496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 44.625, + "completions/mean_terminated_length": 44.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.34849998354911804, + "epoch": 0.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8569178581237793, + "kl": 0.10115896165370941, + "learning_rate": 3.724201390651263e-06, + "loss": 0.0172, + "num_tokens": 1991176.0, + "reward": 0.0912499949336052, + "reward_std": 0.27447310090065, + "rewards/reward_func/mean": 0.0912499949336052, + "rewards/reward_func/std": 0.36868250370025635, + "sampling/importance_sampling_ratio/max": 2.399606943130493, + "sampling/importance_sampling_ratio/mean": 1.0813590288162231, + "sampling/importance_sampling_ratio/min": 0.29478445649147034, + "sampling/sampling_logp_difference/max": 1.0274195671081543, + "sampling/sampling_logp_difference/mean": 0.03125939890742302, + "step": 358, + "step_time": 74.65547078498639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 40.75, + "completions/mean_terminated_length": 40.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.36881792545318604, + "epoch": 0.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5232397317886353, + "kl": 0.06990374624729156, + "learning_rate": 3.7171352899317743e-06, + "loss": 0.0231, + "num_tokens": 1997445.0, + "reward": 0.2199999988079071, + "reward_std": 0.3053818643093109, + "rewards/reward_func/mean": 0.2199999988079071, + "rewards/reward_func/std": 0.4732864201068878, + "sampling/importance_sampling_ratio/max": 1.7546019554138184, + "sampling/importance_sampling_ratio/mean": 0.9471590518951416, + "sampling/importance_sampling_ratio/min": 0.46070781350135803, + "sampling/sampling_logp_difference/max": 0.7129201889038086, + "sampling/sampling_logp_difference/mean": 0.028827045112848282, + "step": 359, + "step_time": 74.88154268401559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 43.625, + "completions/mean_terminated_length": 43.625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.36909693479537964, + "epoch": 0.72, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8818954825401306, + "kl": 0.040109992027282715, + "learning_rate": 3.710056422039033e-06, + "loss": 0.2106, + "num_tokens": 2003046.0, + "reward": 0.3062500059604645, + "reward_std": 0.5747673511505127, + "rewards/reward_func/mean": 0.3062500059604645, + "rewards/reward_func/std": 0.5578514337539673, + "sampling/importance_sampling_ratio/max": 2.0071616172790527, + "sampling/importance_sampling_ratio/mean": 1.0366851091384888, + "sampling/importance_sampling_ratio/min": 0.5076926946640015, + "sampling/sampling_logp_difference/max": 0.45261478424072266, + "sampling/sampling_logp_difference/mean": 0.028059128671884537, + "step": 360, + "step_time": 67.58852625099826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.375, + "completions/mean_terminated_length": 49.375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.37454652786254883, + "epoch": 0.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1584450006484985, + "kl": 0.01666010171175003, + "learning_rate": 3.702964861227013e-06, + "loss": 0.0801, + "num_tokens": 2008281.0, + "reward": -0.08624999970197678, + "reward_std": 0.0722728967666626, + "rewards/reward_func/mean": -0.08624999970197678, + "rewards/reward_func/std": 0.06781013309955597, + "sampling/importance_sampling_ratio/max": 1.410200595855713, + "sampling/importance_sampling_ratio/mean": 0.9846078753471375, + "sampling/importance_sampling_ratio/min": 0.7172226309776306, + "sampling/sampling_logp_difference/max": 0.46905517578125, + "sampling/sampling_logp_difference/mean": 0.025215893983840942, + "step": 361, + "step_time": 91.82594713801518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 39.375, + "completions/mean_terminated_length": 39.375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.3056425154209137, + "epoch": 0.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8975749015808105, + "kl": 0.11677496135234833, + "learning_rate": 3.695860681882832e-06, + "loss": 0.0079, + "num_tokens": 2014004.0, + "reward": 0.4437500238418579, + "reward_std": 0.6349660754203796, + "rewards/reward_func/mean": 0.4437500238418579, + "rewards/reward_func/std": 0.5882161259651184, + "sampling/importance_sampling_ratio/max": 2.2386791706085205, + "sampling/importance_sampling_ratio/mean": 1.0769392251968384, + "sampling/importance_sampling_ratio/min": 0.508983850479126, + "sampling/sampling_logp_difference/max": 0.8052873611450195, + "sampling/sampling_logp_difference/mean": 0.029708731919527054, + "step": 362, + "step_time": 70.66860884500784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 46.125, + "completions/mean_terminated_length": 46.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3323149085044861, + "epoch": 0.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.601610541343689, + "kl": 0.044327329844236374, + "learning_rate": 3.6887439585259693e-06, + "loss": -0.0394, + "num_tokens": 2019115.0, + "reward": 0.19749999046325684, + "reward_std": 0.5344071984291077, + "rewards/reward_func/mean": 0.19749999046325684, + "rewards/reward_func/std": 0.49485206604003906, + "sampling/importance_sampling_ratio/max": 1.6172147989273071, + "sampling/importance_sampling_ratio/mean": 1.0461113452911377, + "sampling/importance_sampling_ratio/min": 0.661210834980011, + "sampling/sampling_logp_difference/max": 0.423846960067749, + "sampling/sampling_logp_difference/mean": 0.02795753814280033, + "step": 363, + "step_time": 64.47163575098966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3601047396659851, + "epoch": 0.728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0017635822296143, + "kl": 0.026227379217743874, + "learning_rate": 3.6816147658074864e-06, + "loss": 0.0791, + "num_tokens": 2024411.0, + "reward": 0.2212499976158142, + "reward_std": 0.5133354663848877, + "rewards/reward_func/mean": 0.2212499976158142, + "rewards/reward_func/std": 0.4760383367538452, + "sampling/importance_sampling_ratio/max": 1.4538358449935913, + "sampling/importance_sampling_ratio/mean": 0.9334630966186523, + "sampling/importance_sampling_ratio/min": 0.5542329549789429, + "sampling/sampling_logp_difference/max": 0.3732813596725464, + "sampling/sampling_logp_difference/mean": 0.02371375635266304, + "step": 364, + "step_time": 62.915858155989554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.33008331060409546, + "epoch": 0.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4119970798492432, + "kl": 0.02246996760368347, + "learning_rate": 3.6744731785092396e-06, + "loss": 0.1875, + "num_tokens": 2029629.0, + "reward": 0.4650000333786011, + "reward_std": 0.4775117039680481, + "rewards/reward_func/mean": 0.4650000333786011, + "rewards/reward_func/std": 0.5166375041007996, + "sampling/importance_sampling_ratio/max": 1.4311546087265015, + "sampling/importance_sampling_ratio/mean": 0.8069183230400085, + "sampling/importance_sampling_ratio/min": 0.30695483088493347, + "sampling/sampling_logp_difference/max": 0.8008233308792114, + "sampling/sampling_logp_difference/mean": 0.028506487607955933, + "step": 365, + "step_time": 69.22173458198085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.34061259031295776, + "epoch": 0.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.749092161655426, + "kl": 0.06381135433912277, + "learning_rate": 3.6673192715431016e-06, + "loss": 0.1062, + "num_tokens": 2035390.0, + "reward": 0.3412500023841858, + "reward_std": 0.5657950639724731, + "rewards/reward_func/mean": 0.3412500023841858, + "rewards/reward_func/std": 0.5442803502082825, + "sampling/importance_sampling_ratio/max": 1.2575000524520874, + "sampling/importance_sampling_ratio/mean": 0.7373183965682983, + "sampling/importance_sampling_ratio/min": 0.2651961147785187, + "sampling/sampling_logp_difference/max": 0.8941724300384521, + "sampling/sampling_logp_difference/mean": 0.024856336414813995, + "step": 366, + "step_time": 61.59983134100912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 45.75, + "completions/mean_terminated_length": 45.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3267011046409607, + "epoch": 0.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0291807651519775, + "kl": 0.0357687771320343, + "learning_rate": 3.6601531199501715e-06, + "loss": 0.0779, + "num_tokens": 2041220.0, + "reward": 0.33000001311302185, + "reward_std": 0.5733025074005127, + "rewards/reward_func/mean": 0.33000001311302185, + "rewards/reward_func/std": 0.5529143214225769, + "sampling/importance_sampling_ratio/max": 1.1229214668273926, + "sampling/importance_sampling_ratio/mean": 0.8919734954833984, + "sampling/importance_sampling_ratio/min": 0.6937599778175354, + "sampling/sampling_logp_difference/max": 0.5363889932632446, + "sampling/sampling_logp_difference/mean": 0.029397767037153244, + "step": 367, + "step_time": 66.55807171101333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 46.0, + "completions/mean_terminated_length": 46.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.4076734185218811, + "epoch": 0.736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5241880416870117, + "kl": 0.04465167969465256, + "learning_rate": 3.652974798899988e-06, + "loss": -0.1161, + "num_tokens": 2047319.0, + "reward": 0.3499999940395355, + "reward_std": 0.5534278154373169, + "rewards/reward_func/mean": 0.3499999940395355, + "rewards/reward_func/std": 0.5284478664398193, + "sampling/importance_sampling_ratio/max": 1.7426992654800415, + "sampling/importance_sampling_ratio/mean": 0.89775550365448, + "sampling/importance_sampling_ratio/min": 0.49534907937049866, + "sampling/sampling_logp_difference/max": 0.423353910446167, + "sampling/sampling_logp_difference/mean": 0.027613524347543716, + "step": 368, + "step_time": 61.087412825989304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 44.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.31100600957870483, + "epoch": 0.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9741218090057373, + "kl": 0.041330281645059586, + "learning_rate": 3.645784383689742e-06, + "loss": -0.0427, + "num_tokens": 2052270.0, + "reward": 0.45249998569488525, + "reward_std": 0.6035691499710083, + "rewards/reward_func/mean": 0.45249998569488525, + "rewards/reward_func/std": 0.5591256618499756, + "sampling/importance_sampling_ratio/max": 1.8154767751693726, + "sampling/importance_sampling_ratio/mean": 1.2792425155639648, + "sampling/importance_sampling_ratio/min": 0.7800292372703552, + "sampling/sampling_logp_difference/max": 0.3694136142730713, + "sampling/sampling_logp_difference/mean": 0.022285200655460358, + "step": 369, + "step_time": 50.397062509000534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3526594042778015, + "epoch": 0.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3253847360610962, + "kl": 0.1274574100971222, + "learning_rate": 3.6385819497434877e-06, + "loss": -0.0685, + "num_tokens": 2057269.0, + "reward": 0.33375000953674316, + "reward_std": 0.5655902028083801, + "rewards/reward_func/mean": 0.33375000953674316, + "rewards/reward_func/std": 0.5493616461753845, + "sampling/importance_sampling_ratio/max": 1.402198076248169, + "sampling/importance_sampling_ratio/mean": 0.8689178824424744, + "sampling/importance_sampling_ratio/min": 0.3067050278186798, + "sampling/sampling_logp_difference/max": 0.9296143054962158, + "sampling/sampling_logp_difference/mean": 0.026013534516096115, + "step": 370, + "step_time": 54.17731352400733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.32903480529785156, + "epoch": 0.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6364682912826538, + "kl": 0.044946420937776566, + "learning_rate": 3.631367572611348e-06, + "loss": -0.2922, + "num_tokens": 2063722.0, + "reward": 0.3500000238418579, + "reward_std": 0.5495222806930542, + "rewards/reward_func/mean": 0.3500000238418579, + "rewards/reward_func/std": 0.5277445316314697, + "sampling/importance_sampling_ratio/max": 1.5483042001724243, + "sampling/importance_sampling_ratio/mean": 0.8218961954116821, + "sampling/importance_sampling_ratio/min": 0.41721194982528687, + "sampling/sampling_logp_difference/max": 0.5305330753326416, + "sampling/sampling_logp_difference/mean": 0.028160959482192993, + "step": 371, + "step_time": 61.68605206900975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 44.25, + "completions/mean_terminated_length": 44.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.31724613904953003, + "epoch": 0.744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3481601476669312, + "kl": 0.06961038708686829, + "learning_rate": 3.6241413279687256e-06, + "loss": 0.2308, + "num_tokens": 2069668.0, + "reward": 0.20499999821186066, + "reward_std": 0.3475438058376312, + "rewards/reward_func/mean": 0.20499999821186066, + "rewards/reward_func/std": 0.4931531250476837, + "sampling/importance_sampling_ratio/max": 1.3638176918029785, + "sampling/importance_sampling_ratio/mean": 0.7686522006988525, + "sampling/importance_sampling_ratio/min": 0.18426480889320374, + "sampling/sampling_logp_difference/max": 0.6221842765808105, + "sampling/sampling_logp_difference/mean": 0.032336391508579254, + "step": 372, + "step_time": 73.30788465001388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 49.875, + "completions/mean_terminated_length": 49.875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.29763156175613403, + "epoch": 0.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0755096673965454, + "kl": 0.02788379229605198, + "learning_rate": 3.616903291615506e-06, + "loss": 0.0231, + "num_tokens": 2074693.0, + "reward": 0.3149999976158142, + "reward_std": 0.5274383425712585, + "rewards/reward_func/mean": 0.3149999976158142, + "rewards/reward_func/std": 0.5020813345909119, + "sampling/importance_sampling_ratio/max": 1.2642863988876343, + "sampling/importance_sampling_ratio/mean": 0.8364578485488892, + "sampling/importance_sampling_ratio/min": 0.37059077620506287, + "sampling/sampling_logp_difference/max": 0.4319629669189453, + "sampling/sampling_logp_difference/mean": 0.026207586750388145, + "step": 373, + "step_time": 66.48929881799268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 46.125, + "completions/mean_terminated_length": 46.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.35318872332572937, + "epoch": 0.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5173914432525635, + "kl": 0.024975910782814026, + "learning_rate": 3.609653539475268e-06, + "loss": -0.0445, + "num_tokens": 2080341.0, + "reward": 0.3137499988079071, + "reward_std": 0.3301275372505188, + "rewards/reward_func/mean": 0.3137499988079071, + "rewards/reward_func/std": 0.5643184781074524, + "sampling/importance_sampling_ratio/max": 1.3749171495437622, + "sampling/importance_sampling_ratio/mean": 0.8896816372871399, + "sampling/importance_sampling_ratio/min": 0.5193299651145935, + "sampling/sampling_logp_difference/max": 0.5717992782592773, + "sampling/sampling_logp_difference/mean": 0.030124176293611526, + "step": 374, + "step_time": 78.54219227202702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.36500370502471924, + "epoch": 0.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6961071491241455, + "kl": 0.21644490957260132, + "learning_rate": 3.6023921475944795e-06, + "loss": 0.2398, + "num_tokens": 2085708.0, + "reward": 0.19875000417232513, + "reward_std": 0.517146110534668, + "rewards/reward_func/mean": 0.19875000417232513, + "rewards/reward_func/std": 0.47908952832221985, + "sampling/importance_sampling_ratio/max": 2.1243157386779785, + "sampling/importance_sampling_ratio/mean": 0.9670206904411316, + "sampling/importance_sampling_ratio/min": 0.29750171303749084, + "sampling/sampling_logp_difference/max": 1.0457005500793457, + "sampling/sampling_logp_difference/mean": 0.03371373564004898, + "step": 375, + "step_time": 66.68903090100503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 40.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.38407090306282043, + "epoch": 0.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7444822788238525, + "kl": 0.07864461094141006, + "learning_rate": 3.5951191921417063e-06, + "loss": -0.0054, + "num_tokens": 2091007.0, + "reward": 0.3712500035762787, + "reward_std": 0.5389498472213745, + "rewards/reward_func/mean": 0.3712500035762787, + "rewards/reward_func/std": 0.5179474949836731, + "sampling/importance_sampling_ratio/max": 1.3411577939987183, + "sampling/importance_sampling_ratio/mean": 0.9035917520523071, + "sampling/importance_sampling_ratio/min": 0.5722795128822327, + "sampling/sampling_logp_difference/max": 0.6103904247283936, + "sampling/sampling_logp_difference/mean": 0.04205818474292755, + "step": 376, + "step_time": 62.62372294199304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 45.375, + "completions/mean_terminated_length": 45.375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3571351170539856, + "epoch": 0.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4393010139465332, + "kl": 0.05092768371105194, + "learning_rate": 3.5878347494068083e-06, + "loss": 0.0737, + "num_tokens": 2096885.0, + "reward": -0.08124999701976776, + "reward_std": 0.05413114279508591, + "rewards/reward_func/mean": -0.08124999701976776, + "rewards/reward_func/std": 0.05617256462574005, + "sampling/importance_sampling_ratio/max": 1.5597022771835327, + "sampling/importance_sampling_ratio/mean": 1.0888936519622803, + "sampling/importance_sampling_ratio/min": 0.7159003019332886, + "sampling/sampling_logp_difference/max": 0.7558160424232483, + "sampling/sampling_logp_difference/mean": 0.028478611260652542, + "step": 377, + "step_time": 78.95239180698991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 44.125, + "completions/mean_terminated_length": 44.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3541492819786072, + "epoch": 0.756, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.104987382888794, + "kl": 0.06759263575077057, + "learning_rate": 3.580538895800144e-06, + "loss": -0.0204, + "num_tokens": 2102217.0, + "reward": 0.19624999165534973, + "reward_std": 0.5378745794296265, + "rewards/reward_func/mean": 0.19624999165534973, + "rewards/reward_func/std": 0.49805158376693726, + "sampling/importance_sampling_ratio/max": 0.9906109571456909, + "sampling/importance_sampling_ratio/mean": 0.7715339660644531, + "sampling/importance_sampling_ratio/min": 0.558393657207489, + "sampling/sampling_logp_difference/max": 0.5296880006790161, + "sampling/sampling_logp_difference/mean": 0.026829030364751816, + "step": 378, + "step_time": 68.67950404499425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 43.625, + "completions/mean_terminated_length": 43.625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3333103060722351, + "epoch": 0.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.306211233139038, + "kl": 0.04187663272023201, + "learning_rate": 3.573231707851765e-06, + "loss": -0.0646, + "num_tokens": 2108035.0, + "reward": 0.4725000262260437, + "reward_std": 0.5237306356430054, + "rewards/reward_func/mean": 0.4725000262260437, + "rewards/reward_func/std": 0.5537598729133606, + "sampling/importance_sampling_ratio/max": 1.1231999397277832, + "sampling/importance_sampling_ratio/mean": 0.8123407363891602, + "sampling/importance_sampling_ratio/min": 0.6417423486709595, + "sampling/sampling_logp_difference/max": 0.675841212272644, + "sampling/sampling_logp_difference/mean": 0.029995568096637726, + "step": 379, + "step_time": 58.079839242011076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 42.125, + "completions/mean_terminated_length": 42.125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.33336013555526733, + "epoch": 0.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.535415530204773, + "kl": 0.06656455248594284, + "learning_rate": 3.5659132622106152e-06, + "loss": -0.1762, + "num_tokens": 2113701.0, + "reward": 0.0637500062584877, + "reward_std": 0.2779829502105713, + "rewards/reward_func/mean": 0.0637500062584877, + "rewards/reward_func/std": 0.37591552734375, + "sampling/importance_sampling_ratio/max": 2.21850848197937, + "sampling/importance_sampling_ratio/mean": 1.0035760402679443, + "sampling/importance_sampling_ratio/min": 0.36623576283454895, + "sampling/sampling_logp_difference/max": 0.5119132995605469, + "sampling/sampling_logp_difference/mean": 0.03865154832601547, + "step": 380, + "step_time": 83.85681084700627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 41.75, + "completions/mean_terminated_length": 41.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.34765076637268066, + "epoch": 0.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4875462055206299, + "kl": 0.05875123292207718, + "learning_rate": 3.5585836356437266e-06, + "loss": 0.0993, + "num_tokens": 2118822.0, + "reward": 0.05499999597668648, + "reward_std": 0.28776630759239197, + "rewards/reward_func/mean": 0.05499999597668648, + "rewards/reward_func/std": 0.37928506731987, + "sampling/importance_sampling_ratio/max": 2.0379128456115723, + "sampling/importance_sampling_ratio/mean": 1.3014514446258545, + "sampling/importance_sampling_ratio/min": 0.6343486309051514, + "sampling/sampling_logp_difference/max": 0.4492349624633789, + "sampling/sampling_logp_difference/mean": 0.026231329888105392, + "step": 381, + "step_time": 78.49495024702628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3074229955673218, + "epoch": 0.764, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0164070129394531, + "kl": 0.019709967076778412, + "learning_rate": 3.551242905035412e-06, + "loss": -0.1317, + "num_tokens": 2125216.0, + "reward": 0.08374999463558197, + "reward_std": 0.2835198938846588, + "rewards/reward_func/mean": 0.08374999463558197, + "rewards/reward_func/std": 0.37217265367507935, + "sampling/importance_sampling_ratio/max": 1.2361012697219849, + "sampling/importance_sampling_ratio/mean": 0.9302610158920288, + "sampling/importance_sampling_ratio/min": 0.6827021241188049, + "sampling/sampling_logp_difference/max": 0.3573673963546753, + "sampling/sampling_logp_difference/mean": 0.02304799109697342, + "step": 382, + "step_time": 84.3939113280212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.3727479577064514, + "epoch": 0.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0571773052215576, + "kl": 0.09017743915319443, + "learning_rate": 3.5438911473864633e-06, + "loss": -0.2174, + "num_tokens": 2131334.0, + "reward": 0.0962500050663948, + "reward_std": 0.2652566134929657, + "rewards/reward_func/mean": 0.0962500050663948, + "rewards/reward_func/std": 0.35399505496025085, + "sampling/importance_sampling_ratio/max": 2.516140937805176, + "sampling/importance_sampling_ratio/mean": 1.00909423828125, + "sampling/importance_sampling_ratio/min": 0.6436149477958679, + "sampling/sampling_logp_difference/max": 0.5928447246551514, + "sampling/sampling_logp_difference/mean": 0.03408171236515045, + "step": 383, + "step_time": 73.35666742300964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 43.5, + "completions/mean_terminated_length": 43.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3380432724952698, + "epoch": 0.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5185048580169678, + "kl": 0.05075772851705551, + "learning_rate": 3.5365284398133404e-06, + "loss": 0.1199, + "num_tokens": 2136480.0, + "reward": 0.30000001192092896, + "reward_std": 0.5517951250076294, + "rewards/reward_func/mean": 0.30000001192092896, + "rewards/reward_func/std": 0.5378262996673584, + "sampling/importance_sampling_ratio/max": 2.533414125442505, + "sampling/importance_sampling_ratio/mean": 0.987388014793396, + "sampling/importance_sampling_ratio/min": 0.2914103865623474, + "sampling/sampling_logp_difference/max": 0.4989492893218994, + "sampling/sampling_logp_difference/mean": 0.03067699819803238, + "step": 384, + "step_time": 55.22710300600738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3242243528366089, + "epoch": 0.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.345138669013977, + "kl": 0.039375655353069305, + "learning_rate": 3.52915485954736e-06, + "loss": 0.0886, + "num_tokens": 2141751.0, + "reward": 0.4362500011920929, + "reward_std": 0.5983107686042786, + "rewards/reward_func/mean": 0.4362500011920929, + "rewards/reward_func/std": 0.5539711117744446, + "sampling/importance_sampling_ratio/max": 1.7134405374526978, + "sampling/importance_sampling_ratio/mean": 1.042137861251831, + "sampling/importance_sampling_ratio/min": 0.4775417149066925, + "sampling/sampling_logp_difference/max": 0.5579397678375244, + "sampling/sampling_logp_difference/mean": 0.025659702718257904, + "step": 385, + "step_time": 67.54349041997921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 37.125, + "completions/mean_terminated_length": 37.125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.33516746759414673, + "epoch": 0.772, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.208026170730591, + "kl": 0.05478062853217125, + "learning_rate": 3.521770483933891e-06, + "loss": 0.2502, + "num_tokens": 2146979.0, + "reward": -0.0637499988079071, + "reward_std": 0.04775945842266083, + "rewards/reward_func/mean": -0.0637499988079071, + "rewards/reward_func/std": 0.050409041345119476, + "sampling/importance_sampling_ratio/max": 1.972628116607666, + "sampling/importance_sampling_ratio/mean": 1.1597208976745605, + "sampling/importance_sampling_ratio/min": 0.6736937165260315, + "sampling/sampling_logp_difference/max": 0.4482576847076416, + "sampling/sampling_logp_difference/mean": 0.027906980365514755, + "step": 386, + "step_time": 74.46757221099688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 46.0, + "completions/mean_terminated_length": 46.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.37138980627059937, + "epoch": 0.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9289368391036987, + "kl": 0.10337759554386139, + "learning_rate": 3.514375390431539e-06, + "loss": 0.172, + "num_tokens": 2153373.0, + "reward": 0.2900000214576721, + "reward_std": 0.5930180549621582, + "rewards/reward_func/mean": 0.2900000214576721, + "rewards/reward_func/std": 0.5795319080352783, + "sampling/importance_sampling_ratio/max": 1.536302924156189, + "sampling/importance_sampling_ratio/mean": 0.7297533750534058, + "sampling/importance_sampling_ratio/min": 0.289122611284256, + "sampling/sampling_logp_difference/max": 0.8604832887649536, + "sampling/sampling_logp_difference/mean": 0.034349218010902405, + "step": 387, + "step_time": 61.59288591900258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 44.375, + "completions/mean_terminated_length": 44.375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.4133331775665283, + "epoch": 0.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9855160713195801, + "kl": 0.045814886689186096, + "learning_rate": 3.5069696566113347e-06, + "loss": 0.0904, + "num_tokens": 2159078.0, + "reward": 0.33500000834465027, + "reward_std": 0.5581594705581665, + "rewards/reward_func/mean": 0.33500000834465027, + "rewards/reward_func/std": 0.5354037284851074, + "sampling/importance_sampling_ratio/max": 1.518318772315979, + "sampling/importance_sampling_ratio/mean": 0.8139652609825134, + "sampling/importance_sampling_ratio/min": 0.37126120924949646, + "sampling/sampling_logp_difference/max": 0.5169713497161865, + "sampling/sampling_logp_difference/mean": 0.03355231136083603, + "step": 388, + "step_time": 81.17526444801479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 35.875, + "completions/mean_terminated_length": 35.875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3351132869720459, + "epoch": 0.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4382963180541992, + "kl": 0.047305479645729065, + "learning_rate": 3.499553360155923e-06, + "loss": 0.1196, + "num_tokens": 2165109.0, + "reward": 0.2150000035762787, + "reward_std": 0.5208038091659546, + "rewards/reward_func/mean": 0.2150000035762787, + "rewards/reward_func/std": 0.48329228162765503, + "sampling/importance_sampling_ratio/max": 1.9932667016983032, + "sampling/importance_sampling_ratio/mean": 1.2677991390228271, + "sampling/importance_sampling_ratio/min": 0.7389498949050903, + "sampling/sampling_logp_difference/max": 0.46536529064178467, + "sampling/sampling_logp_difference/mean": 0.027284495532512665, + "step": 389, + "step_time": 68.42802210498485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3567490577697754, + "epoch": 0.78, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0806845426559448, + "kl": 0.039206504821777344, + "learning_rate": 3.4921265788587432e-06, + "loss": -0.1312, + "num_tokens": 2170662.0, + "reward": 0.1899999976158142, + "reward_std": 0.5110079050064087, + "rewards/reward_func/mean": 0.1899999976158142, + "rewards/reward_func/std": 0.4738294184207916, + "sampling/importance_sampling_ratio/max": 1.3621087074279785, + "sampling/importance_sampling_ratio/mean": 0.7618493437767029, + "sampling/importance_sampling_ratio/min": 0.11757281422615051, + "sampling/sampling_logp_difference/max": 0.7672085762023926, + "sampling/sampling_logp_difference/mean": 0.029348157346248627, + "step": 390, + "step_time": 66.64952985799755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 41.5, + "completions/mean_terminated_length": 41.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.348691463470459, + "epoch": 0.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4213788509368896, + "kl": 0.06088120490312576, + "learning_rate": 3.484689390623218e-06, + "loss": -0.2873, + "num_tokens": 2176785.0, + "reward": 0.3537500202655792, + "reward_std": 0.5490626096725464, + "rewards/reward_func/mean": 0.3537500202655792, + "rewards/reward_func/std": 0.5272554159164429, + "sampling/importance_sampling_ratio/max": 2.726332426071167, + "sampling/importance_sampling_ratio/mean": 1.269676923751831, + "sampling/importance_sampling_ratio/min": 0.4701959788799286, + "sampling/sampling_logp_difference/max": 0.5951485633850098, + "sampling/sampling_logp_difference/mean": 0.031430695205926895, + "step": 391, + "step_time": 64.82404442300322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.32772278785705566, + "epoch": 0.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8402619361877441, + "kl": 0.03273371234536171, + "learning_rate": 3.4772418734619325e-06, + "loss": 0.2288, + "num_tokens": 2182195.0, + "reward": 0.09624999761581421, + "reward_std": 0.27131104469299316, + "rewards/reward_func/mean": 0.09624999761581421, + "rewards/reward_func/std": 0.3657844066619873, + "sampling/importance_sampling_ratio/max": 2.0606470108032227, + "sampling/importance_sampling_ratio/mean": 1.0602631568908691, + "sampling/importance_sampling_ratio/min": 0.5077344179153442, + "sampling/sampling_logp_difference/max": 0.41891008615493774, + "sampling/sampling_logp_difference/mean": 0.028547827154397964, + "step": 392, + "step_time": 64.06525129399961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.32786738872528076, + "epoch": 0.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5365052223205566, + "kl": 0.044348280876874924, + "learning_rate": 3.4697841054958163e-06, + "loss": -0.1633, + "num_tokens": 2188346.0, + "reward": 0.36000001430511475, + "reward_std": 0.5519298315048218, + "rewards/reward_func/mean": 0.36000001430511475, + "rewards/reward_func/std": 0.5301482677459717, + "sampling/importance_sampling_ratio/max": 2.044487237930298, + "sampling/importance_sampling_ratio/mean": 1.0870068073272705, + "sampling/importance_sampling_ratio/min": 0.6400982141494751, + "sampling/sampling_logp_difference/max": 0.7581937313079834, + "sampling/sampling_logp_difference/mean": 0.02735818549990654, + "step": 393, + "step_time": 63.89243840900599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 41.625, + "completions/mean_terminated_length": 41.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3156575858592987, + "epoch": 0.788, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4243261814117432, + "kl": 0.05710726976394653, + "learning_rate": 3.4623161649533284e-06, + "loss": -0.3008, + "num_tokens": 2193765.0, + "reward": 0.32749998569488525, + "reward_std": 0.5432307720184326, + "rewards/reward_func/mean": 0.32749998569488525, + "rewards/reward_func/std": 0.5308685898780823, + "sampling/importance_sampling_ratio/max": 2.2074787616729736, + "sampling/importance_sampling_ratio/mean": 1.2915685176849365, + "sampling/importance_sampling_ratio/min": 0.6163145303726196, + "sampling/sampling_logp_difference/max": 0.40680623054504395, + "sampling/sampling_logp_difference/mean": 0.02459460124373436, + "step": 394, + "step_time": 63.77927023899974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.357902467250824, + "epoch": 0.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4730643033981323, + "kl": 0.07591858506202698, + "learning_rate": 3.4548381301696298e-06, + "loss": 0.1483, + "num_tokens": 2199321.0, + "reward": -0.03375000134110451, + "reward_std": 0.026678871363401413, + "rewards/reward_func/mean": -0.03375000134110451, + "rewards/reward_func/std": 0.025035688653588295, + "sampling/importance_sampling_ratio/max": 2.484659433364868, + "sampling/importance_sampling_ratio/mean": 1.1020760536193848, + "sampling/importance_sampling_ratio/min": 0.20723672211170197, + "sampling/sampling_logp_difference/max": 0.9204421043395996, + "sampling/sampling_logp_difference/mean": 0.03480283543467522, + "step": 395, + "step_time": 72.94622380597866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.4134517312049866, + "epoch": 0.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6859726905822754, + "kl": 0.03517558425664902, + "learning_rate": 3.4473500795857674e-06, + "loss": -0.1951, + "num_tokens": 2204573.0, + "reward": 0.20875000953674316, + "reward_std": 0.3220616579055786, + "rewards/reward_func/mean": 0.20875000953674316, + "rewards/reward_func/std": 0.47588828206062317, + "sampling/importance_sampling_ratio/max": 1.7888513803482056, + "sampling/importance_sampling_ratio/mean": 1.0129364728927612, + "sampling/importance_sampling_ratio/min": 0.49311375617980957, + "sampling/sampling_logp_difference/max": 0.5829896926879883, + "sampling/sampling_logp_difference/mean": 0.03469054028391838, + "step": 396, + "step_time": 74.89418144299998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 44.125, + "completions/mean_terminated_length": 44.125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.30955445766448975, + "epoch": 0.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5693347454071045, + "kl": 0.04247763007879257, + "learning_rate": 3.4398520917478478e-06, + "loss": -0.0086, + "num_tokens": 2210029.0, + "reward": 0.08750000596046448, + "reward_std": 0.2670246660709381, + "rewards/reward_func/mean": 0.08750000596046448, + "rewards/reward_func/std": 0.36958470940589905, + "sampling/importance_sampling_ratio/max": 1.8365312814712524, + "sampling/importance_sampling_ratio/mean": 1.170079231262207, + "sampling/importance_sampling_ratio/min": 0.4552127420902252, + "sampling/sampling_logp_difference/max": 0.3448103666305542, + "sampling/sampling_logp_difference/mean": 0.02435469999909401, + "step": 397, + "step_time": 71.30152476101648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.34962934255599976, + "epoch": 0.796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2815780639648438, + "kl": 0.03365220129489899, + "learning_rate": 3.4323442453062173e-06, + "loss": 0.1781, + "num_tokens": 2214891.0, + "reward": 0.32499998807907104, + "reward_std": 0.5809470415115356, + "rewards/reward_func/mean": 0.32499998807907104, + "rewards/reward_func/std": 0.5604844689369202, + "sampling/importance_sampling_ratio/max": 1.4398008584976196, + "sampling/importance_sampling_ratio/mean": 0.9877474308013916, + "sampling/importance_sampling_ratio/min": 0.5344565510749817, + "sampling/sampling_logp_difference/max": 0.38164573907852173, + "sampling/sampling_logp_difference/mean": 0.02679327502846718, + "step": 398, + "step_time": 67.73330377798993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 43.125, + "completions/mean_terminated_length": 43.125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.30688661336898804, + "epoch": 0.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5855470895767212, + "kl": 0.0417325496673584, + "learning_rate": 3.4248266190146307e-06, + "loss": 0.004, + "num_tokens": 2220361.0, + "reward": 0.4762499928474426, + "reward_std": 0.6048096418380737, + "rewards/reward_func/mean": 0.4762499928474426, + "rewards/reward_func/std": 0.560661256313324, + "sampling/importance_sampling_ratio/max": 1.3558542728424072, + "sampling/importance_sampling_ratio/mean": 1.0467090606689453, + "sampling/importance_sampling_ratio/min": 0.8046448230743408, + "sampling/sampling_logp_difference/max": 0.37460851669311523, + "sampling/sampling_logp_difference/mean": 0.02754260040819645, + "step": 399, + "step_time": 63.321532267989824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 50.375, + "completions/mean_terminated_length": 50.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.36169493198394775, + "epoch": 0.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4955718517303467, + "kl": 0.05709603428840637, + "learning_rate": 3.417299291729431e-06, + "loss": -0.3635, + "num_tokens": 2225385.0, + "reward": 0.3387500047683716, + "reward_std": 0.5678717494010925, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.5412007570266724, + "sampling/importance_sampling_ratio/max": 2.468524694442749, + "sampling/importance_sampling_ratio/mean": 1.3570051193237305, + "sampling/importance_sampling_ratio/min": 0.45365825295448303, + "sampling/sampling_logp_difference/max": 0.5086992979049683, + "sampling/sampling_logp_difference/mean": 0.028253626078367233, + "step": 400, + "step_time": 61.90352356500807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 50.375, + "completions/mean_terminated_length": 50.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.34915873408317566, + "epoch": 0.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7310982346534729, + "kl": 0.028745461255311966, + "learning_rate": 3.4097623424087196e-06, + "loss": -0.1418, + "num_tokens": 2231023.0, + "reward": 0.3375000059604645, + "reward_std": 0.2833724915981293, + "rewards/reward_func/mean": 0.3375000059604645, + "rewards/reward_func/std": 0.5279272198677063, + "sampling/importance_sampling_ratio/max": 1.532747745513916, + "sampling/importance_sampling_ratio/mean": 0.7657254934310913, + "sampling/importance_sampling_ratio/min": 0.29685893654823303, + "sampling/sampling_logp_difference/max": 0.4246586561203003, + "sampling/sampling_logp_difference/mean": 0.0307551771402359, + "step": 401, + "step_time": 82.38972250098595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 43.875, + "completions/mean_terminated_length": 43.875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3424833416938782, + "epoch": 0.804, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.250693917274475, + "kl": 0.03432589769363403, + "learning_rate": 3.4022158501115283e-06, + "loss": -0.166, + "num_tokens": 2237005.0, + "reward": 0.21125000715255737, + "reward_std": 0.312855988740921, + "rewards/reward_func/mean": 0.21125000715255737, + "rewards/reward_func/std": 0.4853699207305908, + "sampling/importance_sampling_ratio/max": 1.5625203847885132, + "sampling/importance_sampling_ratio/mean": 1.0789234638214111, + "sampling/importance_sampling_ratio/min": 0.6180092096328735, + "sampling/sampling_logp_difference/max": 0.35615015029907227, + "sampling/sampling_logp_difference/mean": 0.026920361444354057, + "step": 402, + "step_time": 65.18766634300118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 45.625, + "completions/mean_terminated_length": 45.625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3446376919746399, + "epoch": 0.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2360810041427612, + "kl": 0.0331093966960907, + "learning_rate": 3.39465989399699e-06, + "loss": 0.16, + "num_tokens": 2242114.0, + "reward": 0.45625001192092896, + "reward_std": 0.6050564050674438, + "rewards/reward_func/mean": 0.45625001192092896, + "rewards/reward_func/std": 0.5601769685745239, + "sampling/importance_sampling_ratio/max": 2.0269222259521484, + "sampling/importance_sampling_ratio/mean": 1.0755150318145752, + "sampling/importance_sampling_ratio/min": 0.587192714214325, + "sampling/sampling_logp_difference/max": 0.29895949363708496, + "sampling/sampling_logp_difference/mean": 0.025805631652474403, + "step": 403, + "step_time": 54.9477135160123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 46.875, + "completions/mean_terminated_length": 46.875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3936536908149719, + "epoch": 0.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0102906227111816, + "kl": 0.059495870023965836, + "learning_rate": 3.3870945533235104e-06, + "loss": -0.0334, + "num_tokens": 2247189.0, + "reward": 0.19625000655651093, + "reward_std": 0.5299696922302246, + "rewards/reward_func/mean": 0.19625000655651093, + "rewards/reward_func/std": 0.4908865690231323, + "sampling/importance_sampling_ratio/max": 1.2130988836288452, + "sampling/importance_sampling_ratio/mean": 0.9266165494918823, + "sampling/importance_sampling_ratio/min": 0.7416336536407471, + "sampling/sampling_logp_difference/max": 0.2876337766647339, + "sampling/sampling_logp_difference/mean": 0.028024829924106598, + "step": 404, + "step_time": 71.07167344598565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 54.75, + "completions/mean_terminated_length": 54.75, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.334345281124115, + "epoch": 0.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5419965982437134, + "kl": 0.025859929621219635, + "learning_rate": 3.3795199074479312e-06, + "loss": -0.1101, + "num_tokens": 2252252.0, + "reward": 0.6000000238418579, + "reward_std": 0.5447690486907959, + "rewards/reward_func/mean": 0.6000000238418579, + "rewards/reward_func/std": 0.5224940180778503, + "sampling/importance_sampling_ratio/max": 1.4799644947052002, + "sampling/importance_sampling_ratio/mean": 0.7123466730117798, + "sampling/importance_sampling_ratio/min": 0.14022274315357208, + "sampling/sampling_logp_difference/max": 0.7181998491287231, + "sampling/sampling_logp_difference/mean": 0.026170939207077026, + "step": 405, + "step_time": 48.95189845201094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3260424733161926, + "epoch": 0.812, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2206482887268066, + "kl": 0.07319030910730362, + "learning_rate": 3.3719360358247054e-06, + "loss": -0.1504, + "num_tokens": 2257641.0, + "reward": 0.21000000834465027, + "reward_std": 0.3072892427444458, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.47958314418792725, + "sampling/importance_sampling_ratio/max": 1.4747114181518555, + "sampling/importance_sampling_ratio/mean": 0.7886297702789307, + "sampling/importance_sampling_ratio/min": 0.39762672781944275, + "sampling/sampling_logp_difference/max": 0.3593275547027588, + "sampling/sampling_logp_difference/mean": 0.02779657021164894, + "step": 406, + "step_time": 61.756067362002796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.341819167137146, + "epoch": 0.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2889909744262695, + "kl": 0.01969139650464058, + "learning_rate": 3.3643430180050573e-06, + "loss": 0.7006, + "num_tokens": 2263126.0, + "reward": 0.4150000214576721, + "reward_std": 0.5482439398765564, + "rewards/reward_func/mean": 0.4150000214576721, + "rewards/reward_func/std": 0.5823842883110046, + "sampling/importance_sampling_ratio/max": 2.7643935680389404, + "sampling/importance_sampling_ratio/mean": 1.1803869009017944, + "sampling/importance_sampling_ratio/min": 0.6169243454933167, + "sampling/sampling_logp_difference/max": 0.3439610004425049, + "sampling/sampling_logp_difference/mean": 0.025598403066396713, + "step": 407, + "step_time": 63.48746524998569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.32550209760665894, + "epoch": 0.816, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5375040769577026, + "kl": 0.02752833254635334, + "learning_rate": 3.3567409336361502e-06, + "loss": -0.0216, + "num_tokens": 2268322.0, + "reward": 0.19875000417232513, + "reward_std": 0.30625003576278687, + "rewards/reward_func/mean": 0.19875000417232513, + "rewards/reward_func/std": 0.45642828941345215, + "sampling/importance_sampling_ratio/max": 2.6713926792144775, + "sampling/importance_sampling_ratio/mean": 1.2524373531341553, + "sampling/importance_sampling_ratio/min": 0.6978874206542969, + "sampling/sampling_logp_difference/max": 0.4598565101623535, + "sampling/sampling_logp_difference/mean": 0.027718737721443176, + "step": 408, + "step_time": 67.29830334399594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 44.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.36724764108657837, + "epoch": 0.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9681098461151123, + "kl": 0.05620799958705902, + "learning_rate": 3.3491298624602514e-06, + "loss": -0.1462, + "num_tokens": 2273479.0, + "reward": 0.59375, + "reward_std": 0.5756310224533081, + "rewards/reward_func/mean": 0.59375, + "rewards/reward_func/std": 0.5577746033668518, + "sampling/importance_sampling_ratio/max": 2.058076858520508, + "sampling/importance_sampling_ratio/mean": 1.0275644063949585, + "sampling/importance_sampling_ratio/min": 0.5849137306213379, + "sampling/sampling_logp_difference/max": 0.6340939998626709, + "sampling/sampling_logp_difference/mean": 0.033282943069934845, + "step": 409, + "step_time": 67.91412261300138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 39.625, + "completions/mean_terminated_length": 39.625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3391873240470886, + "epoch": 0.82, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0897281169891357, + "kl": 0.048063769936561584, + "learning_rate": 3.3415098843138972e-06, + "loss": -0.1435, + "num_tokens": 2279337.0, + "reward": 0.10750000923871994, + "reward_std": 0.2595524489879608, + "rewards/reward_func/mean": 0.10750000923871994, + "rewards/reward_func/std": 0.35289618372917175, + "sampling/importance_sampling_ratio/max": 2.8655083179473877, + "sampling/importance_sampling_ratio/mean": 1.0282737016677856, + "sampling/importance_sampling_ratio/min": 0.27357611060142517, + "sampling/sampling_logp_difference/max": 0.4681780934333801, + "sampling/sampling_logp_difference/mean": 0.03042689338326454, + "step": 410, + "step_time": 83.52543040498858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 42.75, + "completions/mean_terminated_length": 42.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3415806293487549, + "epoch": 0.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6372363567352295, + "kl": 0.050147589296102524, + "learning_rate": 3.333881079127052e-06, + "loss": -0.3055, + "num_tokens": 2284889.0, + "reward": 0.21000000834465027, + "reward_std": 0.5254905223846436, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.4876474142074585, + "sampling/importance_sampling_ratio/max": 2.446664571762085, + "sampling/importance_sampling_ratio/mean": 0.9917970895767212, + "sampling/importance_sampling_ratio/min": 0.3648597002029419, + "sampling/sampling_logp_difference/max": 0.5268797874450684, + "sampling/sampling_logp_difference/mean": 0.022872356697916985, + "step": 411, + "step_time": 73.05385316698812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3670850694179535, + "epoch": 0.824, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2050609588623047, + "kl": 0.01863059774041176, + "learning_rate": 3.326243526922272e-06, + "loss": 0.1589, + "num_tokens": 2290321.0, + "reward": 0.19500000774860382, + "reward_std": 0.3351808488368988, + "rewards/reward_func/mean": 0.19500000774860382, + "rewards/reward_func/std": 0.49318209290504456, + "sampling/importance_sampling_ratio/max": 1.9619362354278564, + "sampling/importance_sampling_ratio/mean": 0.9046612977981567, + "sampling/importance_sampling_ratio/min": 0.5072652697563171, + "sampling/sampling_logp_difference/max": 0.5272719860076904, + "sampling/sampling_logp_difference/mean": 0.027489028871059418, + "step": 412, + "step_time": 92.1496964310063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3263223469257355, + "epoch": 0.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7847272157669067, + "kl": 0.03273576870560646, + "learning_rate": 3.3185973078138665e-06, + "loss": 0.0545, + "num_tokens": 2296019.0, + "reward": 0.20125000178813934, + "reward_std": 0.5348846316337585, + "rewards/reward_func/mean": 0.20125000178813934, + "rewards/reward_func/std": 0.4956507384777069, + "sampling/importance_sampling_ratio/max": 1.2074002027511597, + "sampling/importance_sampling_ratio/mean": 0.7049754858016968, + "sampling/importance_sampling_ratio/min": 0.45924341678619385, + "sampling/sampling_logp_difference/max": 0.36492061614990234, + "sampling/sampling_logp_difference/mean": 0.029689345508813858, + "step": 413, + "step_time": 74.78980298401439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 43.125, + "completions/mean_terminated_length": 43.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.34388232231140137, + "epoch": 0.828, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1072615385055542, + "kl": 0.043772339820861816, + "learning_rate": 3.3109425020070564e-06, + "loss": 0.2443, + "num_tokens": 2301154.0, + "reward": 0.15125000476837158, + "reward_std": 0.5701001286506653, + "rewards/reward_func/mean": 0.15125000476837158, + "rewards/reward_func/std": 0.5283244848251343, + "sampling/importance_sampling_ratio/max": 1.8563178777694702, + "sampling/importance_sampling_ratio/mean": 0.9956398010253906, + "sampling/importance_sampling_ratio/min": 0.31948381662368774, + "sampling/sampling_logp_difference/max": 0.6346423625946045, + "sampling/sampling_logp_difference/mean": 0.0296938456594944, + "step": 414, + "step_time": 71.1694306099962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.350239098072052, + "epoch": 0.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9788258075714111, + "kl": 0.02329590916633606, + "learning_rate": 3.3032791897971313e-06, + "loss": 0.0095, + "num_tokens": 2306595.0, + "reward": 0.7325000166893005, + "reward_std": 0.31240540742874146, + "rewards/reward_func/mean": 0.7325000166893005, + "rewards/reward_func/std": 0.4742136597633362, + "sampling/importance_sampling_ratio/max": 1.4771513938903809, + "sampling/importance_sampling_ratio/mean": 1.0924103260040283, + "sampling/importance_sampling_ratio/min": 0.5864495038986206, + "sampling/sampling_logp_difference/max": 0.35701167583465576, + "sampling/sampling_logp_difference/mean": 0.022829465568065643, + "step": 415, + "step_time": 51.9381214719906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3615211248397827, + "epoch": 0.832, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6271722316741943, + "kl": 0.04001173749566078, + "learning_rate": 3.2956074515686105e-06, + "loss": 0.0667, + "num_tokens": 2311738.0, + "reward": 0.17375001311302185, + "reward_std": 0.34344157576560974, + "rewards/reward_func/mean": 0.17375001311302185, + "rewards/reward_func/std": 0.49100297689437866, + "sampling/importance_sampling_ratio/max": 2.3310201168060303, + "sampling/importance_sampling_ratio/mean": 1.2795183658599854, + "sampling/importance_sampling_ratio/min": 0.5067328214645386, + "sampling/sampling_logp_difference/max": 0.495988130569458, + "sampling/sampling_logp_difference/mean": 0.029406055808067322, + "step": 416, + "step_time": 78.19868917198619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.3619380593299866, + "epoch": 0.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2272183895111084, + "kl": 0.04641294479370117, + "learning_rate": 3.2879273677943972e-06, + "loss": -0.0374, + "num_tokens": 2317239.0, + "reward": 0.4699999988079071, + "reward_std": 0.5924452543258667, + "rewards/reward_func/mean": 0.4699999988079071, + "rewards/reward_func/std": 0.5485044717788696, + "sampling/importance_sampling_ratio/max": 1.6035174131393433, + "sampling/importance_sampling_ratio/mean": 0.8214474320411682, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.5355191230773926, + "sampling/sampling_logp_difference/mean": 0.031899720430374146, + "step": 417, + "step_time": 67.45473215699894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 43.25, + "completions/mean_terminated_length": 43.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.40914639830589294, + "epoch": 0.836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7058494091033936, + "kl": 0.04687212407588959, + "learning_rate": 3.2802390190349364e-06, + "loss": 0.1611, + "num_tokens": 2323573.0, + "reward": 0.33000001311302185, + "reward_std": 0.5646458864212036, + "rewards/reward_func/mean": 0.33000001311302185, + "rewards/reward_func/std": 0.5414794683456421, + "sampling/importance_sampling_ratio/max": 2.929563522338867, + "sampling/importance_sampling_ratio/mean": 1.1725656986236572, + "sampling/importance_sampling_ratio/min": 0.3646584451198578, + "sampling/sampling_logp_difference/max": 0.5263292789459229, + "sampling/sampling_logp_difference/mean": 0.03212471306324005, + "step": 418, + "step_time": 73.99689673900139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 51.75, + "completions/mean_terminated_length": 51.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.388424813747406, + "epoch": 0.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2566641569137573, + "kl": 0.040666110813617706, + "learning_rate": 3.272542485937369e-06, + "loss": 0.0819, + "num_tokens": 2329180.0, + "reward": 0.3449999988079071, + "reward_std": 0.5655855536460876, + "rewards/reward_func/mean": 0.3449999988079071, + "rewards/reward_func/std": 0.5425863862037659, + "sampling/importance_sampling_ratio/max": 1.893444538116455, + "sampling/importance_sampling_ratio/mean": 1.027420997619629, + "sampling/importance_sampling_ratio/min": 0.41170260310173035, + "sampling/sampling_logp_difference/max": 0.336561918258667, + "sampling/sampling_logp_difference/mean": 0.02408537268638611, + "step": 419, + "step_time": 65.69701232301304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 45.75, + "completions/mean_terminated_length": 45.75, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3684334456920624, + "epoch": 0.84, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4962432384490967, + "kl": 0.028718747198581696, + "learning_rate": 3.264837849234685e-06, + "loss": -0.1286, + "num_tokens": 2335827.0, + "reward": 0.3362500071525574, + "reward_std": 0.5644514560699463, + "rewards/reward_func/mean": 0.3362500071525574, + "rewards/reward_func/std": 0.5412403345108032, + "sampling/importance_sampling_ratio/max": 2.035574197769165, + "sampling/importance_sampling_ratio/mean": 1.2553296089172363, + "sampling/importance_sampling_ratio/min": 0.5002336502075195, + "sampling/sampling_logp_difference/max": 0.2905765771865845, + "sampling/sampling_logp_difference/mean": 0.02430140972137451, + "step": 420, + "step_time": 772.381344155001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 43.375, + "completions/mean_terminated_length": 43.375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.39142391085624695, + "epoch": 0.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1592785120010376, + "kl": 0.037018656730651855, + "learning_rate": 3.257125189744877e-06, + "loss": -0.101, + "num_tokens": 2341291.0, + "reward": 0.45375001430511475, + "reward_std": 0.6169389486312866, + "rewards/reward_func/mean": 0.45375001430511475, + "rewards/reward_func/std": 0.571687638759613, + "sampling/importance_sampling_ratio/max": 1.636826992034912, + "sampling/importance_sampling_ratio/mean": 0.8795583248138428, + "sampling/importance_sampling_ratio/min": 0.4447176456451416, + "sampling/sampling_logp_difference/max": 0.5345578193664551, + "sampling/sampling_logp_difference/mean": 0.024022206664085388, + "step": 421, + "step_time": 59.57560216600541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 44.25, + "completions/mean_terminated_length": 44.25, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.34517401456832886, + "epoch": 0.844, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.165475606918335, + "kl": 0.05213654413819313, + "learning_rate": 3.249404588370095e-06, + "loss": 0.1109, + "num_tokens": 2346050.0, + "reward": 0.3412500023841858, + "reward_std": 0.5530316829681396, + "rewards/reward_func/mean": 0.3412500023841858, + "rewards/reward_func/std": 0.532713770866394, + "sampling/importance_sampling_ratio/max": 2.421031951904297, + "sampling/importance_sampling_ratio/mean": 1.4897425174713135, + "sampling/importance_sampling_ratio/min": 0.8438997864723206, + "sampling/sampling_logp_difference/max": 0.3565685749053955, + "sampling/sampling_logp_difference/mean": 0.028397034853696823, + "step": 422, + "step_time": 53.3122301310068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3420068621635437, + "epoch": 0.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7624173760414124, + "kl": 0.03828991949558258, + "learning_rate": 3.2416761260957925e-06, + "loss": 0.0449, + "num_tokens": 2351785.0, + "reward": 0.17999999225139618, + "reward_std": 0.5343748331069946, + "rewards/reward_func/mean": 0.17999999225139618, + "rewards/reward_func/std": 0.49509018659591675, + "sampling/importance_sampling_ratio/max": 0.8726930022239685, + "sampling/importance_sampling_ratio/mean": 0.6684524416923523, + "sampling/importance_sampling_ratio/min": 0.5162980556488037, + "sampling/sampling_logp_difference/max": 0.7039575576782227, + "sampling/sampling_logp_difference/mean": 0.023529747501015663, + "step": 423, + "step_time": 71.27505167000345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 45.5, + "completions/mean_terminated_length": 45.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3805294632911682, + "epoch": 0.848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6922258138656616, + "kl": 0.04322695732116699, + "learning_rate": 3.233939883989882e-06, + "loss": 0.1443, + "num_tokens": 2357558.0, + "reward": 0.3450000286102295, + "reward_std": 0.24957968294620514, + "rewards/reward_func/mean": 0.3450000286102295, + "rewards/reward_func/std": 0.5098739862442017, + "sampling/importance_sampling_ratio/max": 2.186922788619995, + "sampling/importance_sampling_ratio/mean": 1.163309097290039, + "sampling/importance_sampling_ratio/min": 0.4379298686981201, + "sampling/sampling_logp_difference/max": 0.7527205944061279, + "sampling/sampling_logp_difference/mean": 0.030846048146486282, + "step": 424, + "step_time": 52.987184192985296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 50.375, + "completions/mean_terminated_length": 50.375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.380068302154541, + "epoch": 0.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7612060308456421, + "kl": 0.03679168224334717, + "learning_rate": 3.2261959432018834e-06, + "loss": -0.0798, + "num_tokens": 2362976.0, + "reward": 0.32499998807907104, + "reward_std": 0.5652101039886475, + "rewards/reward_func/mean": 0.32499998807907104, + "rewards/reward_func/std": 0.5401322841644287, + "sampling/importance_sampling_ratio/max": 1.0401641130447388, + "sampling/importance_sampling_ratio/mean": 0.5586026906967163, + "sampling/importance_sampling_ratio/min": 0.27103391289711, + "sampling/sampling_logp_difference/max": 0.5478124618530273, + "sampling/sampling_logp_difference/mean": 0.03913367539644241, + "step": 425, + "step_time": 76.52260100099375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3354133069515228, + "epoch": 0.852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4465264081954956, + "kl": 0.02433839999139309, + "learning_rate": 3.218444384962071e-06, + "loss": -0.404, + "num_tokens": 2368735.0, + "reward": 0.21250000596046448, + "reward_std": 0.32150566577911377, + "rewards/reward_func/mean": 0.21250000596046448, + "rewards/reward_func/std": 0.48443636298179626, + "sampling/importance_sampling_ratio/max": 1.766361951828003, + "sampling/importance_sampling_ratio/mean": 0.968987226486206, + "sampling/importance_sampling_ratio/min": 0.29835739731788635, + "sampling/sampling_logp_difference/max": 0.7882108688354492, + "sampling/sampling_logp_difference/mean": 0.029678575694561005, + "step": 426, + "step_time": 68.1356228920049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 43.125, + "completions/mean_terminated_length": 43.125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.33666664361953735, + "epoch": 0.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9550222158432007, + "kl": 0.0564693845808506, + "learning_rate": 3.210685290580622e-06, + "loss": -0.0499, + "num_tokens": 2373721.0, + "reward": 0.3474999964237213, + "reward_std": 0.5655620098114014, + "rewards/reward_func/mean": 0.3474999964237213, + "rewards/reward_func/std": 0.5410770177841187, + "sampling/importance_sampling_ratio/max": 1.442548155784607, + "sampling/importance_sampling_ratio/mean": 0.9848485589027405, + "sampling/importance_sampling_ratio/min": 0.6200289726257324, + "sampling/sampling_logp_difference/max": 0.4186820983886719, + "sampling/sampling_logp_difference/mean": 0.02865251712501049, + "step": 427, + "step_time": 46.70685623100144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.36010992527008057, + "epoch": 0.856, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5365159511566162, + "kl": 0.03516993671655655, + "learning_rate": 3.2029187414467645e-06, + "loss": -0.1816, + "num_tokens": 2379614.0, + "reward": 0.22500000894069672, + "reward_std": 0.3162981867790222, + "rewards/reward_func/mean": 0.22500000894069672, + "rewards/reward_func/std": 0.47952359914779663, + "sampling/importance_sampling_ratio/max": 1.5209850072860718, + "sampling/importance_sampling_ratio/mean": 1.071610689163208, + "sampling/importance_sampling_ratio/min": 0.45961546897888184, + "sampling/sampling_logp_difference/max": 0.6144394874572754, + "sampling/sampling_logp_difference/mean": 0.033051151782274246, + "step": 428, + "step_time": 65.01070052201976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.34195345640182495, + "epoch": 0.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953159749507904, + "kl": 0.032611116766929626, + "learning_rate": 3.1951448190279256e-06, + "loss": 0.0294, + "num_tokens": 2385361.0, + "reward": 0.09000000357627869, + "reward_std": 0.2679736018180847, + "rewards/reward_func/mean": 0.09000000357627869, + "rewards/reward_func/std": 0.3642212748527527, + "sampling/importance_sampling_ratio/max": 1.2745250463485718, + "sampling/importance_sampling_ratio/mean": 0.9374400973320007, + "sampling/importance_sampling_ratio/min": 0.4317379891872406, + "sampling/sampling_logp_difference/max": 0.44930171966552734, + "sampling/sampling_logp_difference/mean": 0.02398090809583664, + "step": 429, + "step_time": 63.217384373012464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 57.0, + "completions/mean_terminated_length": 57.0, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3918173611164093, + "epoch": 0.86, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7345211505889893, + "kl": 0.02356931008398533, + "learning_rate": 3.1873636048688714e-06, + "loss": 0.0721, + "num_tokens": 2390785.0, + "reward": 0.2150000035762787, + "reward_std": 0.5217581987380981, + "rewards/reward_func/mean": 0.2150000035762787, + "rewards/reward_func/std": 0.4830853343009949, + "sampling/importance_sampling_ratio/max": 1.0560283660888672, + "sampling/importance_sampling_ratio/mean": 0.7125515937805176, + "sampling/importance_sampling_ratio/min": 0.24447228014469147, + "sampling/sampling_logp_difference/max": 1.1061149835586548, + "sampling/sampling_logp_difference/mean": 0.02791447564959526, + "step": 430, + "step_time": 69.20054752400029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 49.375, + "completions/mean_terminated_length": 49.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3796185851097107, + "epoch": 0.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9570589661598206, + "kl": 0.03291895240545273, + "learning_rate": 3.1795751805908578e-06, + "loss": -0.0766, + "num_tokens": 2396141.0, + "reward": 0.3125, + "reward_std": 0.5881974697113037, + "rewards/reward_func/mean": 0.3125, + "rewards/reward_func/std": 0.5682240724563599, + "sampling/importance_sampling_ratio/max": 1.2188318967819214, + "sampling/importance_sampling_ratio/mean": 0.7393078207969666, + "sampling/importance_sampling_ratio/min": 0.3626616597175598, + "sampling/sampling_logp_difference/max": 0.3476827144622803, + "sampling/sampling_logp_difference/mean": 0.029474619776010513, + "step": 431, + "step_time": 65.4366222230019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 42.875, + "completions/mean_terminated_length": 42.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3385878801345825, + "epoch": 0.864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5139399766921997, + "kl": 0.041907161474227905, + "learning_rate": 3.171779627890769e-06, + "loss": 0.1129, + "num_tokens": 2400741.0, + "reward": 0.1899999976158142, + "reward_std": 0.3400847911834717, + "rewards/reward_func/mean": 0.1899999976158142, + "rewards/reward_func/std": 0.4960990846157074, + "sampling/importance_sampling_ratio/max": 1.7639198303222656, + "sampling/importance_sampling_ratio/mean": 1.0898196697235107, + "sampling/importance_sampling_ratio/min": 0.6894667148590088, + "sampling/sampling_logp_difference/max": 0.5719653367996216, + "sampling/sampling_logp_difference/mean": 0.028854355216026306, + "step": 432, + "step_time": 46.04387527299696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.34956616163253784, + "epoch": 0.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9688817262649536, + "kl": 0.02931622788310051, + "learning_rate": 3.1639770285402632e-06, + "loss": 0.1477, + "num_tokens": 2405893.0, + "reward": 0.04375000298023224, + "reward_std": 0.3060930669307709, + "rewards/reward_func/mean": 0.04375000298023224, + "rewards/reward_func/std": 0.39467665553092957, + "sampling/importance_sampling_ratio/max": 1.536496639251709, + "sampling/importance_sampling_ratio/mean": 1.0786614418029785, + "sampling/importance_sampling_ratio/min": 0.7137445211410522, + "sampling/sampling_logp_difference/max": 0.34972822666168213, + "sampling/sampling_logp_difference/mean": 0.02250964567065239, + "step": 433, + "step_time": 69.64978085900657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.36804884672164917, + "epoch": 0.868, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0465831756591797, + "kl": 0.034811001271009445, + "learning_rate": 3.1561674643849173e-06, + "loss": -0.1412, + "num_tokens": 2411564.0, + "reward": 0.10000000149011612, + "reward_std": 0.26890355348587036, + "rewards/reward_func/mean": 0.10000000149011612, + "rewards/reward_func/std": 0.3607531785964966, + "sampling/importance_sampling_ratio/max": 2.2357981204986572, + "sampling/importance_sampling_ratio/mean": 0.8856201767921448, + "sampling/importance_sampling_ratio/min": 0.4398100972175598, + "sampling/sampling_logp_difference/max": 0.648827075958252, + "sampling/sampling_logp_difference/mean": 0.02694147266447544, + "step": 434, + "step_time": 78.29400004900526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3648771643638611, + "epoch": 0.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5500693321228027, + "kl": 0.049925297498703, + "learning_rate": 3.148351017343363e-06, + "loss": 0.0987, + "num_tokens": 2418201.0, + "reward": 0.21875, + "reward_std": 0.5047336220741272, + "rewards/reward_func/mean": 0.21875, + "rewards/reward_func/std": 0.46759071946144104, + "sampling/importance_sampling_ratio/max": 2.2972049713134766, + "sampling/importance_sampling_ratio/mean": 1.2422068119049072, + "sampling/importance_sampling_ratio/min": 0.5480000376701355, + "sampling/sampling_logp_difference/max": 0.49748849868774414, + "sampling/sampling_logp_difference/mean": 0.03015657514333725, + "step": 435, + "step_time": 80.38836584499222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 45.875, + "completions/mean_terminated_length": 45.875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3652074337005615, + "epoch": 0.872, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921752214431763, + "kl": 0.0575464591383934, + "learning_rate": 3.1405277694064306e-06, + "loss": -0.2442, + "num_tokens": 2423760.0, + "reward": 0.20124998688697815, + "reward_std": 0.32401105761528015, + "rewards/reward_func/mean": 0.20124998688697815, + "rewards/reward_func/std": 0.49380266666412354, + "sampling/importance_sampling_ratio/max": 1.8371641635894775, + "sampling/importance_sampling_ratio/mean": 1.028379201889038, + "sampling/importance_sampling_ratio/min": 0.34969013929367065, + "sampling/sampling_logp_difference/max": 0.47838956117630005, + "sampling/sampling_logp_difference/mean": 0.028822563588619232, + "step": 436, + "step_time": 111.30371044500498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3787916898727417, + "epoch": 0.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.373725414276123, + "kl": 0.029538137838244438, + "learning_rate": 3.1326978026362907e-06, + "loss": -0.1174, + "num_tokens": 2429732.0, + "reward": 0.1912499964237213, + "reward_std": 0.32099446654319763, + "rewards/reward_func/mean": 0.1912499964237213, + "rewards/reward_func/std": 0.4616256058216095, + "sampling/importance_sampling_ratio/max": 1.5630745887756348, + "sampling/importance_sampling_ratio/mean": 1.042021632194519, + "sampling/importance_sampling_ratio/min": 0.4928293526172638, + "sampling/sampling_logp_difference/max": 0.35456085205078125, + "sampling/sampling_logp_difference/mean": 0.02736075408756733, + "step": 437, + "step_time": 104.65821277699433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.35759854316711426, + "epoch": 0.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3202944993972778, + "kl": 0.040575023740530014, + "learning_rate": 3.1248611991655885e-06, + "loss": 0.0142, + "num_tokens": 2435583.0, + "reward": 0.2237500101327896, + "reward_std": 0.3150855600833893, + "rewards/reward_func/mean": 0.2237500101327896, + "rewards/reward_func/std": 0.4777906835079193, + "sampling/importance_sampling_ratio/max": 1.4550713300704956, + "sampling/importance_sampling_ratio/mean": 0.7871130108833313, + "sampling/importance_sampling_ratio/min": 0.40031906962394714, + "sampling/sampling_logp_difference/max": 0.5306928157806396, + "sampling/sampling_logp_difference/mean": 0.028820747509598732, + "step": 438, + "step_time": 81.86951732399757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.3563615679740906, + "epoch": 0.878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.279760718345642, + "kl": 0.03050798550248146, + "learning_rate": 3.1170180411965854e-06, + "loss": -0.1991, + "num_tokens": 2442392.0, + "reward": 0.36250001192092896, + "reward_std": 0.5422559976577759, + "rewards/reward_func/mean": 0.36250001192092896, + "rewards/reward_func/std": 0.519855797290802, + "sampling/importance_sampling_ratio/max": 1.3513309955596924, + "sampling/importance_sampling_ratio/mean": 0.7875853776931763, + "sampling/importance_sampling_ratio/min": 0.474069207906723, + "sampling/sampling_logp_difference/max": 0.5970335006713867, + "sampling/sampling_logp_difference/mean": 0.027284270152449608, + "step": 439, + "step_time": 78.53958937400603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.36696097254753113, + "epoch": 0.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3010104894638062, + "kl": 0.10381826758384705, + "learning_rate": 3.109168411000299e-06, + "loss": 0.0224, + "num_tokens": 2447245.0, + "reward": 0.4625000059604645, + "reward_std": 0.5960428714752197, + "rewards/reward_func/mean": 0.4625000059604645, + "rewards/reward_func/std": 0.5521063804626465, + "sampling/importance_sampling_ratio/max": 1.707277536392212, + "sampling/importance_sampling_ratio/mean": 0.8685052394866943, + "sampling/importance_sampling_ratio/min": 0.19099442660808563, + "sampling/sampling_logp_difference/max": 1.2762131690979004, + "sampling/sampling_logp_difference/mean": 0.029757626354694366, + "step": 440, + "step_time": 63.11404897898319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 46.125, + "completions/mean_terminated_length": 46.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3573892414569855, + "epoch": 0.882, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6774803400039673, + "kl": 0.05356031656265259, + "learning_rate": 3.1013123909156347e-06, + "loss": -0.0621, + "num_tokens": 2452150.0, + "reward": 0.3125, + "reward_std": 0.5504498481750488, + "rewards/reward_func/mean": 0.3125, + "rewards/reward_func/std": 0.5366496443748474, + "sampling/importance_sampling_ratio/max": 2.0549330711364746, + "sampling/importance_sampling_ratio/mean": 1.1193873882293701, + "sampling/importance_sampling_ratio/min": 0.4323776066303253, + "sampling/sampling_logp_difference/max": 0.3989245891571045, + "sampling/sampling_logp_difference/mean": 0.025564704090356827, + "step": 441, + "step_time": 76.97689333499875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 54.125, + "completions/mean_terminated_length": 54.125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.30483388900756836, + "epoch": 0.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4331544637680054, + "kl": 0.02398044988512993, + "learning_rate": 3.093450063348525e-06, + "loss": 0.364, + "num_tokens": 2457723.0, + "reward": 0.07000000029802322, + "reward_std": 0.2906396687030792, + "rewards/reward_func/mean": 0.07000000029802322, + "rewards/reward_func/std": 0.37815341353416443, + "sampling/importance_sampling_ratio/max": 2.091522216796875, + "sampling/importance_sampling_ratio/mean": 1.1620619297027588, + "sampling/importance_sampling_ratio/min": 0.6012184023857117, + "sampling/sampling_logp_difference/max": 0.30550384521484375, + "sampling/sampling_logp_difference/mean": 0.022237438708543777, + "step": 442, + "step_time": 95.15003140000044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.34086257219314575, + "epoch": 0.886, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1229441165924072, + "kl": 0.01796545460820198, + "learning_rate": 3.085581510771067e-06, + "loss": -0.0669, + "num_tokens": 2462560.0, + "reward": 0.3512499928474426, + "reward_std": 0.5547357797622681, + "rewards/reward_func/mean": 0.3512499928474426, + "rewards/reward_func/std": 0.5352286100387573, + "sampling/importance_sampling_ratio/max": 1.212695837020874, + "sampling/importance_sampling_ratio/mean": 0.912885844707489, + "sampling/importance_sampling_ratio/min": 0.4976806342601776, + "sampling/sampling_logp_difference/max": 0.33936166763305664, + "sampling/sampling_logp_difference/mean": 0.02151985839009285, + "step": 443, + "step_time": 56.16717357101152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 45.0, + "completions/mean_terminated_length": 45.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.34066683053970337, + "epoch": 0.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1225097179412842, + "kl": 0.029933886602520943, + "learning_rate": 3.0777068157206535e-06, + "loss": 0.0719, + "num_tokens": 2468388.0, + "reward": 0.1899999976158142, + "reward_std": 0.5411940813064575, + "rewards/reward_func/mean": 0.1899999976158142, + "rewards/reward_func/std": 0.5013980269432068, + "sampling/importance_sampling_ratio/max": 1.6062737703323364, + "sampling/importance_sampling_ratio/mean": 0.7600141167640686, + "sampling/importance_sampling_ratio/min": 0.24852630496025085, + "sampling/sampling_logp_difference/max": 0.6514277458190918, + "sampling/sampling_logp_difference/mean": 0.025996115058660507, + "step": 444, + "step_time": 86.21614760000375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3731077015399933, + "epoch": 0.89, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1878068447113037, + "kl": 0.029690194875001907, + "learning_rate": 3.0698260607991094e-06, + "loss": -0.1014, + "num_tokens": 2473364.0, + "reward": 0.21875, + "reward_std": 0.5235260725021362, + "rewards/reward_func/mean": 0.21875, + "rewards/reward_func/std": 0.4851638376712799, + "sampling/importance_sampling_ratio/max": 1.600021481513977, + "sampling/importance_sampling_ratio/mean": 0.9576125144958496, + "sampling/importance_sampling_ratio/min": 0.5527809858322144, + "sampling/sampling_logp_difference/max": 0.35846877098083496, + "sampling/sampling_logp_difference/mean": 0.02683459408581257, + "step": 445, + "step_time": 66.11640017299214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.36752647161483765, + "epoch": 0.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9450146555900574, + "kl": 0.0277615524828434, + "learning_rate": 3.061939328671824e-06, + "loss": 0.1718, + "num_tokens": 2478775.0, + "reward": 0.3125, + "reward_std": 0.5557467341423035, + "rewards/reward_func/mean": 0.3125, + "rewards/reward_func/std": 0.5402578711509705, + "sampling/importance_sampling_ratio/max": 1.8847589492797852, + "sampling/importance_sampling_ratio/mean": 0.9969915747642517, + "sampling/importance_sampling_ratio/min": 0.4349041283130646, + "sampling/sampling_logp_difference/max": 0.40544378757476807, + "sampling/sampling_logp_difference/mean": 0.02645990625023842, + "step": 446, + "step_time": 93.85621070399066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 44.375, + "completions/mean_terminated_length": 44.375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.35863834619522095, + "epoch": 0.894, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6648696660995483, + "kl": 0.03483325242996216, + "learning_rate": 3.054046702066886e-06, + "loss": 0.1038, + "num_tokens": 2484436.0, + "reward": 0.5862500071525574, + "reward_std": 0.5568501353263855, + "rewards/reward_func/mean": 0.5862500071525574, + "rewards/reward_func/std": 0.5336917042732239, + "sampling/importance_sampling_ratio/max": 2.2653186321258545, + "sampling/importance_sampling_ratio/mean": 1.2341718673706055, + "sampling/importance_sampling_ratio/min": 0.42728522419929504, + "sampling/sampling_logp_difference/max": 0.7648518085479736, + "sampling/sampling_logp_difference/mean": 0.025960015133023262, + "step": 447, + "step_time": 76.71857017299044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3564414083957672, + "epoch": 0.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1071441173553467, + "kl": 0.02148180454969406, + "learning_rate": 3.0461482637742133e-06, + "loss": 0.0403, + "num_tokens": 2490437.0, + "reward": 0.32875001430511475, + "reward_std": 0.569521427154541, + "rewards/reward_func/mean": 0.32875001430511475, + "rewards/reward_func/std": 0.547577440738678, + "sampling/importance_sampling_ratio/max": 1.057119607925415, + "sampling/importance_sampling_ratio/mean": 0.8925005197525024, + "sampling/importance_sampling_ratio/min": 0.7844187021255493, + "sampling/sampling_logp_difference/max": 0.3507990837097168, + "sampling/sampling_logp_difference/mean": 0.026810673996806145, + "step": 448, + "step_time": 82.088118226995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3908958435058594, + "epoch": 0.898, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2796645164489746, + "kl": 0.027592984959483147, + "learning_rate": 3.0382440966446876e-06, + "loss": -0.1308, + "num_tokens": 2496183.0, + "reward": 0.07874999940395355, + "reward_std": 0.2891866862773895, + "rewards/reward_func/mean": 0.07874999940395355, + "rewards/reward_func/std": 0.37215349078178406, + "sampling/importance_sampling_ratio/max": 1.3071595430374146, + "sampling/importance_sampling_ratio/mean": 0.8888345956802368, + "sampling/importance_sampling_ratio/min": 0.33864834904670715, + "sampling/sampling_logp_difference/max": 0.3511269688606262, + "sampling/sampling_logp_difference/mean": 0.03274238109588623, + "step": 449, + "step_time": 70.07518242698279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 58.875, + "completions/mean_terminated_length": 58.875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.35438233613967896, + "epoch": 0.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3290492296218872, + "kl": 0.013173183426260948, + "learning_rate": 3.0303342835892804e-06, + "loss": -0.041, + "num_tokens": 2502767.0, + "reward": 0.23125000298023224, + "reward_std": 0.5127817392349243, + "rewards/reward_func/mean": 0.23125000298023224, + "rewards/reward_func/std": 0.4747461676597595, + "sampling/importance_sampling_ratio/max": 2.287126302719116, + "sampling/importance_sampling_ratio/mean": 1.1372334957122803, + "sampling/importance_sampling_ratio/min": 0.29354920983314514, + "sampling/sampling_logp_difference/max": 0.5065096616744995, + "sampling/sampling_logp_difference/mean": 0.025254379957914352, + "step": 450, + "step_time": 73.5957540590025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 43.125, + "completions/mean_terminated_length": 43.125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.36208608746528625, + "epoch": 0.902, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2244484424591064, + "kl": 0.024516358971595764, + "learning_rate": 3.0224189075781886e-06, + "loss": -0.0166, + "num_tokens": 2509154.0, + "reward": 0.20374999940395355, + "reward_std": 0.524694561958313, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.48582589626312256, + "sampling/importance_sampling_ratio/max": 1.417374610900879, + "sampling/importance_sampling_ratio/mean": 0.9122079610824585, + "sampling/importance_sampling_ratio/min": 0.3931795656681061, + "sampling/sampling_logp_difference/max": 0.4949922561645508, + "sampling/sampling_logp_difference/mean": 0.0227043554186821, + "step": 451, + "step_time": 78.99534437101102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 42.375, + "completions/mean_terminated_length": 42.375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.39959007501602173, + "epoch": 0.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0420317649841309, + "kl": 0.019628848880529404, + "learning_rate": 3.014498051639959e-06, + "loss": 0.1187, + "num_tokens": 2514772.0, + "reward": -0.05375000089406967, + "reward_std": 0.0541689358651638, + "rewards/reward_func/mean": -0.05375000089406967, + "rewards/reward_func/std": 0.05705573782324791, + "sampling/importance_sampling_ratio/max": 1.0508769750595093, + "sampling/importance_sampling_ratio/mean": 0.7083895802497864, + "sampling/importance_sampling_ratio/min": 0.29281339049339294, + "sampling/sampling_logp_difference/max": 0.9839637279510498, + "sampling/sampling_logp_difference/mean": 0.03381787985563278, + "step": 452, + "step_time": 81.53623366498505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.367666631937027, + "epoch": 0.906, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8917317390441895, + "kl": 0.022898491472005844, + "learning_rate": 3.006571798860626e-06, + "loss": 0.0868, + "num_tokens": 2519920.0, + "reward": 0.45625001192092896, + "reward_std": 0.6168291568756104, + "rewards/reward_func/mean": 0.45625001192092896, + "rewards/reward_func/std": 0.5718875527381897, + "sampling/importance_sampling_ratio/max": 2.3027987480163574, + "sampling/importance_sampling_ratio/mean": 1.5321813821792603, + "sampling/importance_sampling_ratio/min": 0.91518634557724, + "sampling/sampling_logp_difference/max": 0.6675479412078857, + "sampling/sampling_logp_difference/mean": 0.02647707611322403, + "step": 453, + "step_time": 71.51700325298589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3597128987312317, + "epoch": 0.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8695248365402222, + "kl": 0.044961053878068924, + "learning_rate": 2.9986402323828274e-06, + "loss": 0.0217, + "num_tokens": 2525228.0, + "reward": 0.32124999165534973, + "reward_std": 0.551304042339325, + "rewards/reward_func/mean": 0.32124999165534973, + "rewards/reward_func/std": 0.5287569761276245, + "sampling/importance_sampling_ratio/max": 1.051468014717102, + "sampling/importance_sampling_ratio/mean": 0.6988952159881592, + "sampling/importance_sampling_ratio/min": 0.27730298042297363, + "sampling/sampling_logp_difference/max": 0.5294761657714844, + "sampling/sampling_logp_difference/mean": 0.03408505767583847, + "step": 454, + "step_time": 66.79093995000585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.40276795625686646, + "epoch": 0.91, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8840203285217285, + "kl": 0.020732712000608444, + "learning_rate": 2.9907034354049443e-06, + "loss": -0.206, + "num_tokens": 2530621.0, + "reward": 0.22499999403953552, + "reward_std": 0.5174823999404907, + "rewards/reward_func/mean": 0.22499999403953552, + "rewards/reward_func/std": 0.4791063070297241, + "sampling/importance_sampling_ratio/max": 1.370976448059082, + "sampling/importance_sampling_ratio/mean": 0.9440828561782837, + "sampling/importance_sampling_ratio/min": 0.7049600481987, + "sampling/sampling_logp_difference/max": 0.33017855882644653, + "sampling/sampling_logp_difference/mean": 0.02502366527915001, + "step": 455, + "step_time": 65.70268553399364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3440622091293335, + "epoch": 0.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9514747262001038, + "kl": 0.02538667805492878, + "learning_rate": 2.9827614911802205e-06, + "loss": -0.2967, + "num_tokens": 2536636.0, + "reward": 0.5924999713897705, + "reward_std": 0.5447898507118225, + "rewards/reward_func/mean": 0.5924999713897705, + "rewards/reward_func/std": 0.5242614150047302, + "sampling/importance_sampling_ratio/max": 1.4159477949142456, + "sampling/importance_sampling_ratio/mean": 0.8564717769622803, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.6149642467498779, + "sampling/sampling_logp_difference/mean": 0.02475292794406414, + "step": 456, + "step_time": 60.81095889999415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.36642664670944214, + "epoch": 0.914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3120555877685547, + "kl": 0.024605643004179, + "learning_rate": 2.9748144830158925e-06, + "loss": 0.1126, + "num_tokens": 2542012.0, + "reward": 0.4650000035762787, + "reward_std": 0.6151281595230103, + "rewards/reward_func/mean": 0.4650000035762787, + "rewards/reward_func/std": 0.5700375437736511, + "sampling/importance_sampling_ratio/max": 1.2174410820007324, + "sampling/importance_sampling_ratio/mean": 0.8987510204315186, + "sampling/importance_sampling_ratio/min": 0.4412446916103363, + "sampling/sampling_logp_difference/max": 0.48370981216430664, + "sampling/sampling_logp_difference/mean": 0.02678913250565529, + "step": 457, + "step_time": 51.764286594989244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 45.375, + "completions/mean_terminated_length": 45.375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.39495640993118286, + "epoch": 0.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3088325262069702, + "kl": 0.03151445835828781, + "learning_rate": 2.966862494272316e-06, + "loss": -0.0101, + "num_tokens": 2547545.0, + "reward": 0.3399999737739563, + "reward_std": 0.554172158241272, + "rewards/reward_func/mean": 0.3399999737739563, + "rewards/reward_func/std": 0.530336856842041, + "sampling/importance_sampling_ratio/max": 1.727379560470581, + "sampling/importance_sampling_ratio/mean": 0.9480923414230347, + "sampling/importance_sampling_ratio/min": 0.2979666590690613, + "sampling/sampling_logp_difference/max": 0.5283234119415283, + "sampling/sampling_logp_difference/mean": 0.02785215526819229, + "step": 458, + "step_time": 67.89937086799182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.33337390422821045, + "epoch": 0.918, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1922571659088135, + "kl": 0.03265373408794403, + "learning_rate": 2.9589056083620902e-06, + "loss": -0.1628, + "num_tokens": 2552724.0, + "reward": 0.4387500286102295, + "reward_std": 0.6136727333068848, + "rewards/reward_func/mean": 0.4387500286102295, + "rewards/reward_func/std": 0.5688695311546326, + "sampling/importance_sampling_ratio/max": 1.5377517938613892, + "sampling/importance_sampling_ratio/mean": 0.7701914310455322, + "sampling/importance_sampling_ratio/min": 0.3763391673564911, + "sampling/sampling_logp_difference/max": 0.8605606555938721, + "sampling/sampling_logp_difference/mean": 0.026637043803930283, + "step": 459, + "step_time": 63.45414676400833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3421512842178345, + "epoch": 0.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6250663995742798, + "kl": 0.02636205032467842, + "learning_rate": 2.9509439087491837e-06, + "loss": 0.0988, + "num_tokens": 2558037.0, + "reward": -0.07374999672174454, + "reward_std": 0.04552318900823593, + "rewards/reward_func/mean": -0.07374999672174454, + "rewards/reward_func/std": 0.050691645592451096, + "sampling/importance_sampling_ratio/max": 2.0454702377319336, + "sampling/importance_sampling_ratio/mean": 1.0612456798553467, + "sampling/importance_sampling_ratio/min": 0.6895912885665894, + "sampling/sampling_logp_difference/max": 0.5679692029953003, + "sampling/sampling_logp_difference/mean": 0.023974724113941193, + "step": 460, + "step_time": 86.26930091198301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.36159491539001465, + "epoch": 0.922, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6231329441070557, + "kl": 0.03310780972242355, + "learning_rate": 2.9429774789480576e-06, + "loss": 0.0836, + "num_tokens": 2562984.0, + "reward": 0.33000001311302185, + "reward_std": 0.5563285946846008, + "rewards/reward_func/mean": 0.33000001311302185, + "rewards/reward_func/std": 0.5400264263153076, + "sampling/importance_sampling_ratio/max": 1.5695173740386963, + "sampling/importance_sampling_ratio/mean": 1.1787632703781128, + "sampling/importance_sampling_ratio/min": 0.7933380007743835, + "sampling/sampling_logp_difference/max": 0.5508012771606445, + "sampling/sampling_logp_difference/mean": 0.028630632907152176, + "step": 461, + "step_time": 44.234594836016186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.36595630645751953, + "epoch": 0.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5493158102035522, + "kl": 0.018796313554048538, + "learning_rate": 2.93500640252279e-06, + "loss": -0.1754, + "num_tokens": 2568343.0, + "reward": 0.4675000309944153, + "reward_std": 0.6093506813049316, + "rewards/reward_func/mean": 0.4675000309944153, + "rewards/reward_func/std": 0.564212441444397, + "sampling/importance_sampling_ratio/max": 1.9704557657241821, + "sampling/importance_sampling_ratio/mean": 1.0828232765197754, + "sampling/importance_sampling_ratio/min": 0.3847387135028839, + "sampling/sampling_logp_difference/max": 0.30640411376953125, + "sampling/sampling_logp_difference/mean": 0.025095216929912567, + "step": 462, + "step_time": 58.50289387899102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.375, + "completions/mean_terminated_length": 49.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3116268217563629, + "epoch": 0.926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1552741527557373, + "kl": 0.02773866429924965, + "learning_rate": 2.927030763086201e-06, + "loss": -0.3653, + "num_tokens": 2573304.0, + "reward": 0.6000000238418579, + "reward_std": 0.5449049472808838, + "rewards/reward_func/mean": 0.6000000238418579, + "rewards/reward_func/std": 0.5248673558235168, + "sampling/importance_sampling_ratio/max": 1.798938512802124, + "sampling/importance_sampling_ratio/mean": 1.0102436542510986, + "sampling/importance_sampling_ratio/min": 0.319669634103775, + "sampling/sampling_logp_difference/max": 0.40699052810668945, + "sampling/sampling_logp_difference/mean": 0.027081940323114395, + "step": 463, + "step_time": 57.92807800701121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3475750684738159, + "epoch": 0.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0931487083435059, + "kl": 0.01362108439207077, + "learning_rate": 2.9190506442989753e-06, + "loss": 0.0808, + "num_tokens": 2578554.0, + "reward": 0.08124999701976776, + "reward_std": 0.2719267010688782, + "rewards/reward_func/mean": 0.08124999701976776, + "rewards/reward_func/std": 0.36041396856307983, + "sampling/importance_sampling_ratio/max": 1.1853911876678467, + "sampling/importance_sampling_ratio/mean": 0.9154686331748962, + "sampling/importance_sampling_ratio/min": 0.48412805795669556, + "sampling/sampling_logp_difference/max": 0.6382970809936523, + "sampling/sampling_logp_difference/mean": 0.022256169468164444, + "step": 464, + "step_time": 61.422123302007094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 42.125, + "completions/mean_terminated_length": 42.125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.307975709438324, + "epoch": 0.93, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3115994930267334, + "kl": 0.033101074397563934, + "learning_rate": 2.9110661298687824e-06, + "loss": -0.0603, + "num_tokens": 2583778.0, + "reward": 0.45875000953674316, + "reward_std": 0.6054055690765381, + "rewards/reward_func/mean": 0.45875000953674316, + "rewards/reward_func/std": 0.5611579418182373, + "sampling/importance_sampling_ratio/max": 1.275829792022705, + "sampling/importance_sampling_ratio/mean": 0.8946892619132996, + "sampling/importance_sampling_ratio/min": 0.5616273283958435, + "sampling/sampling_logp_difference/max": 0.654704213142395, + "sampling/sampling_logp_difference/mean": 0.02203410118818283, + "step": 465, + "step_time": 61.31393297199975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 40.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.35501527786254883, + "epoch": 0.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.040752649307251, + "kl": 0.03077756240963936, + "learning_rate": 2.9030773035493997e-06, + "loss": 0.2758, + "num_tokens": 2589204.0, + "reward": 0.3149999976158142, + "reward_std": 0.5430054664611816, + "rewards/reward_func/mean": 0.3149999976158142, + "rewards/reward_func/std": 0.5299056172370911, + "sampling/importance_sampling_ratio/max": 2.528179883956909, + "sampling/importance_sampling_ratio/mean": 1.2432548999786377, + "sampling/importance_sampling_ratio/min": 0.5364408493041992, + "sampling/sampling_logp_difference/max": 0.34423089027404785, + "sampling/sampling_logp_difference/mean": 0.027653541415929794, + "step": 466, + "step_time": 61.11274787300499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3300337493419647, + "epoch": 0.934, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0090454816818237, + "kl": 0.02721918746829033, + "learning_rate": 2.8950842491398358e-06, + "loss": -0.0327, + "num_tokens": 2595236.0, + "reward": 0.22374999523162842, + "reward_std": 0.5187286734580994, + "rewards/reward_func/mean": 0.22374999523162842, + "rewards/reward_func/std": 0.4808307886123657, + "sampling/importance_sampling_ratio/max": 1.447536587715149, + "sampling/importance_sampling_ratio/mean": 0.9794137477874756, + "sampling/importance_sampling_ratio/min": 0.46334025263786316, + "sampling/sampling_logp_difference/max": 0.3176230192184448, + "sampling/sampling_logp_difference/mean": 0.022374983876943588, + "step": 467, + "step_time": 76.51882786100032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.34252166748046875, + "epoch": 0.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9320221543312073, + "kl": 0.018017075955867767, + "learning_rate": 2.8870870504834497e-06, + "loss": -0.1157, + "num_tokens": 2600730.0, + "reward": 0.07999999821186066, + "reward_std": 0.2839134931564331, + "rewards/reward_func/mean": 0.07999999821186066, + "rewards/reward_func/std": 0.3744710385799408, + "sampling/importance_sampling_ratio/max": 2.2560055255889893, + "sampling/importance_sampling_ratio/mean": 0.970880925655365, + "sampling/importance_sampling_ratio/min": 0.39924535155296326, + "sampling/sampling_logp_difference/max": 0.4781172275543213, + "sampling/sampling_logp_difference/mean": 0.025780895724892616, + "step": 468, + "step_time": 65.44530803800444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3617730140686035, + "epoch": 0.938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8858951926231384, + "kl": 0.030386239290237427, + "learning_rate": 2.87908579146707e-06, + "loss": 0.1291, + "num_tokens": 2606113.0, + "reward": 0.21875, + "reward_std": 0.5210141539573669, + "rewards/reward_func/mean": 0.21875, + "rewards/reward_func/std": 0.48238804936408997, + "sampling/importance_sampling_ratio/max": 1.4669780731201172, + "sampling/importance_sampling_ratio/mean": 0.900518536567688, + "sampling/importance_sampling_ratio/min": 0.4941990077495575, + "sampling/sampling_logp_difference/max": 0.39029061794281006, + "sampling/sampling_logp_difference/mean": 0.022767363116145134, + "step": 469, + "step_time": 64.19338588201208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 45.0, + "completions/mean_terminated_length": 45.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.34387028217315674, + "epoch": 0.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0154716968536377, + "kl": 0.02230965718626976, + "learning_rate": 2.8710805560201184e-06, + "loss": -0.1484, + "num_tokens": 2611738.0, + "reward": 0.6075000166893005, + "reward_std": 0.5653331279754639, + "rewards/reward_func/mean": 0.6075000166893005, + "rewards/reward_func/std": 0.5418421030044556, + "sampling/importance_sampling_ratio/max": 1.161827564239502, + "sampling/importance_sampling_ratio/mean": 0.8859966397285461, + "sampling/importance_sampling_ratio/min": 0.28873908519744873, + "sampling/sampling_logp_difference/max": 0.34857702255249023, + "sampling/sampling_logp_difference/mean": 0.025210872292518616, + "step": 470, + "step_time": 60.65404006501194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3458770513534546, + "epoch": 0.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0485503673553467, + "kl": 0.01893254555761814, + "learning_rate": 2.8630714281137263e-06, + "loss": 0.3028, + "num_tokens": 2617938.0, + "reward": 0.18000000715255737, + "reward_std": 0.3153059482574463, + "rewards/reward_func/mean": 0.18000000715255737, + "rewards/reward_func/std": 0.49796730279922485, + "sampling/importance_sampling_ratio/max": 1.7840207815170288, + "sampling/importance_sampling_ratio/mean": 1.1260120868682861, + "sampling/importance_sampling_ratio/min": 0.7038984298706055, + "sampling/sampling_logp_difference/max": 0.36597251892089844, + "sampling/sampling_logp_difference/mean": 0.02206684835255146, + "step": 471, + "step_time": 64.57444148999639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.4065213203430176, + "epoch": 0.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7479361891746521, + "kl": 0.022909866645932198, + "learning_rate": 2.8550584917598558e-06, + "loss": 0.0759, + "num_tokens": 2624135.0, + "reward": 0.07874999940395355, + "reward_std": 0.26868927478790283, + "rewards/reward_func/mean": 0.07874999940395355, + "rewards/reward_func/std": 0.36490458250045776, + "sampling/importance_sampling_ratio/max": 1.3143762350082397, + "sampling/importance_sampling_ratio/mean": 0.7255112528800964, + "sampling/importance_sampling_ratio/min": 0.27511295676231384, + "sampling/sampling_logp_difference/max": 0.46601831912994385, + "sampling/sampling_logp_difference/mean": 0.031219232827425003, + "step": 472, + "step_time": 83.27192145001027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 45.5, + "completions/mean_terminated_length": 45.5, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3374762535095215, + "epoch": 0.946, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2723208665847778, + "kl": 0.022902309894561768, + "learning_rate": 2.8470418310104175e-06, + "loss": -0.2609, + "num_tokens": 2629832.0, + "reward": 0.0625, + "reward_std": 0.24701336026191711, + "rewards/reward_func/mean": 0.0625, + "rewards/reward_func/std": 0.31998884677886963, + "sampling/importance_sampling_ratio/max": 2.4061381816864014, + "sampling/importance_sampling_ratio/mean": 1.0160009860992432, + "sampling/importance_sampling_ratio/min": 0.5389451384544373, + "sampling/sampling_logp_difference/max": 0.5744847059249878, + "sampling/sampling_logp_difference/mean": 0.028143033385276794, + "step": 473, + "step_time": 73.6639021729934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3563547730445862, + "epoch": 0.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9397950172424316, + "kl": 0.022011034190654755, + "learning_rate": 2.839021529956388e-06, + "loss": 0.0807, + "num_tokens": 2635568.0, + "reward": 0.21124999225139618, + "reward_std": 0.527900218963623, + "rewards/reward_func/mean": 0.21124999225139618, + "rewards/reward_func/std": 0.48894748091697693, + "sampling/importance_sampling_ratio/max": 1.040662169456482, + "sampling/importance_sampling_ratio/mean": 0.7214508652687073, + "sampling/importance_sampling_ratio/min": 0.3372233211994171, + "sampling/sampling_logp_difference/max": 0.45850083231925964, + "sampling/sampling_logp_difference/mean": 0.02489865943789482, + "step": 474, + "step_time": 75.28077130601741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.35087794065475464, + "epoch": 0.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5176308155059814, + "kl": 0.017775265499949455, + "learning_rate": 2.8309976727269335e-06, + "loss": 0.2178, + "num_tokens": 2641222.0, + "reward": 0.06499999761581421, + "reward_std": 0.2761574685573578, + "rewards/reward_func/mean": 0.06499999761581421, + "rewards/reward_func/std": 0.37939804792404175, + "sampling/importance_sampling_ratio/max": 1.7761144638061523, + "sampling/importance_sampling_ratio/mean": 0.925238847732544, + "sampling/importance_sampling_ratio/min": 0.3248174488544464, + "sampling/sampling_logp_difference/max": 0.6076414585113525, + "sampling/sampling_logp_difference/mean": 0.02674829587340355, + "step": 475, + "step_time": 72.01986586101702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 43.625, + "completions/mean_terminated_length": 43.625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.2991679906845093, + "epoch": 0.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390182614326477, + "kl": 0.020573535934090614, + "learning_rate": 2.8229703434885165e-06, + "loss": -0.0348, + "num_tokens": 2646859.0, + "reward": 0.33250001072883606, + "reward_std": 0.5396865606307983, + "rewards/reward_func/mean": 0.33250001072883606, + "rewards/reward_func/std": 0.517707884311676, + "sampling/importance_sampling_ratio/max": 1.602697730064392, + "sampling/importance_sampling_ratio/mean": 0.9728833436965942, + "sampling/importance_sampling_ratio/min": 0.48104777932167053, + "sampling/sampling_logp_difference/max": 0.6165962219238281, + "sampling/sampling_logp_difference/mean": 0.023785192519426346, + "step": 476, + "step_time": 77.24283880199073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.3583204448223114, + "epoch": 0.954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.142444133758545, + "kl": 0.015169752761721611, + "learning_rate": 2.814939626444023e-06, + "loss": -0.0124, + "num_tokens": 2652207.0, + "reward": 0.21375000476837158, + "reward_std": 0.5112752914428711, + "rewards/reward_func/mean": 0.21375000476837158, + "rewards/reward_func/std": 0.473495751619339, + "sampling/importance_sampling_ratio/max": 1.9258140325546265, + "sampling/importance_sampling_ratio/mean": 1.1217129230499268, + "sampling/importance_sampling_ratio/min": 0.8287367820739746, + "sampling/sampling_logp_difference/max": 0.3338189125061035, + "sampling/sampling_logp_difference/mean": 0.022958340123295784, + "step": 477, + "step_time": 61.92764704397996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.35365796089172363, + "epoch": 0.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9897144436836243, + "kl": 0.01738031394779682, + "learning_rate": 2.8069056058318754e-06, + "loss": 0.0097, + "num_tokens": 2658227.0, + "reward": 0.20500001311302185, + "reward_std": 0.5114267468452454, + "rewards/reward_func/mean": 0.20500001311302185, + "rewards/reward_func/std": 0.47416090965270996, + "sampling/importance_sampling_ratio/max": 2.027554750442505, + "sampling/importance_sampling_ratio/mean": 1.0578957796096802, + "sampling/importance_sampling_ratio/min": 0.6797005534172058, + "sampling/sampling_logp_difference/max": 0.5563673973083496, + "sampling/sampling_logp_difference/mean": 0.023688018321990967, + "step": 478, + "step_time": 75.16944676099229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3513268828392029, + "epoch": 0.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9755890965461731, + "kl": 0.017329782247543335, + "learning_rate": 2.7988683659251475e-06, + "loss": -0.0194, + "num_tokens": 2663497.0, + "reward": 0.10375000536441803, + "reward_std": 0.2672772705554962, + "rewards/reward_func/mean": 0.10375000536441803, + "rewards/reward_func/std": 0.3627646863460541, + "sampling/importance_sampling_ratio/max": 1.2677541971206665, + "sampling/importance_sampling_ratio/mean": 0.7190382480621338, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.48462724685668945, + "sampling/sampling_logp_difference/mean": 0.024744585156440735, + "step": 479, + "step_time": 76.88298930699239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 45.625, + "completions/mean_terminated_length": 45.625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.3445096015930176, + "epoch": 0.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1778706312179565, + "kl": 0.01838690973818302, + "learning_rate": 2.7908279910306834e-06, + "loss": 0.0279, + "num_tokens": 2669579.0, + "reward": 0.33125001192092896, + "reward_std": 0.580742359161377, + "rewards/reward_func/mean": 0.33125001192092896, + "rewards/reward_func/std": 0.5531064867973328, + "sampling/importance_sampling_ratio/max": 1.4015132188796997, + "sampling/importance_sampling_ratio/mean": 0.9845772385597229, + "sampling/importance_sampling_ratio/min": 0.40269726514816284, + "sampling/sampling_logp_difference/max": 0.5609352588653564, + "sampling/sampling_logp_difference/mean": 0.02638828381896019, + "step": 480, + "step_time": 77.41213980599423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3905279338359833, + "epoch": 0.962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2462440729141235, + "kl": 0.032538220286369324, + "learning_rate": 2.7827845654882112e-06, + "loss": -0.0404, + "num_tokens": 2675067.0, + "reward": 0.07124999910593033, + "reward_std": 0.2798381745815277, + "rewards/reward_func/mean": 0.07124999910593033, + "rewards/reward_func/std": 0.3645520806312561, + "sampling/importance_sampling_ratio/max": 1.2897956371307373, + "sampling/importance_sampling_ratio/mean": 0.8797916173934937, + "sampling/importance_sampling_ratio/min": 0.4773842692375183, + "sampling/sampling_logp_difference/max": 0.3575262427330017, + "sampling/sampling_logp_difference/mean": 0.02661733888089657, + "step": 481, + "step_time": 87.84593146201223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.3507845997810364, + "epoch": 0.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9909263849258423, + "kl": 0.024185102432966232, + "learning_rate": 2.7747381736694573e-06, + "loss": 0.0312, + "num_tokens": 2680053.0, + "reward": 0.3400000035762787, + "reward_std": 0.5569354891777039, + "rewards/reward_func/mean": 0.3400000035762787, + "rewards/reward_func/std": 0.5335326790809631, + "sampling/importance_sampling_ratio/max": 1.2050838470458984, + "sampling/importance_sampling_ratio/mean": 0.8115805387496948, + "sampling/importance_sampling_ratio/min": 0.21530668437480927, + "sampling/sampling_logp_difference/max": 0.41031479835510254, + "sampling/sampling_logp_difference/mean": 0.02810395136475563, + "step": 482, + "step_time": 71.15641420998145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.32739341259002686, + "epoch": 0.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7815512418746948, + "kl": 0.01184194814413786, + "learning_rate": 2.766688899977266e-06, + "loss": -0.1201, + "num_tokens": 2685381.0, + "reward": 0.06624999642372131, + "reward_std": 0.29767611622810364, + "rewards/reward_func/mean": 0.06624999642372131, + "rewards/reward_func/std": 0.38037341833114624, + "sampling/importance_sampling_ratio/max": 1.4397385120391846, + "sampling/importance_sampling_ratio/mean": 0.828331470489502, + "sampling/importance_sampling_ratio/min": 0.38339871168136597, + "sampling/sampling_logp_difference/max": 0.5013303756713867, + "sampling/sampling_logp_difference/mean": 0.020246436819434166, + "step": 483, + "step_time": 81.71323890099302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3554234504699707, + "epoch": 0.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4028494358062744, + "kl": 0.018699366599321365, + "learning_rate": 2.7586368288447094e-06, + "loss": -0.095, + "num_tokens": 2690901.0, + "reward": -0.06499999761581421, + "reward_std": 0.048902880400419235, + "rewards/reward_func/mean": -0.06499999761581421, + "rewards/reward_func/std": 0.05554920434951782, + "sampling/importance_sampling_ratio/max": 2.3254799842834473, + "sampling/importance_sampling_ratio/mean": 1.0947003364562988, + "sampling/importance_sampling_ratio/min": 0.5614188313484192, + "sampling/sampling_logp_difference/max": 0.3719151020050049, + "sampling/sampling_logp_difference/mean": 0.024835357442498207, + "step": 484, + "step_time": 80.53098407998914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 43.875, + "completions/mean_terminated_length": 43.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.35123682022094727, + "epoch": 0.97, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1409413814544678, + "kl": 0.026009801775217056, + "learning_rate": 2.750582044734203e-06, + "loss": -0.1372, + "num_tokens": 2696449.0, + "reward": 0.3449999988079071, + "reward_std": 0.5669803619384766, + "rewards/reward_func/mean": 0.3449999988079071, + "rewards/reward_func/std": 0.5428759455680847, + "sampling/importance_sampling_ratio/max": 1.3252332210540771, + "sampling/importance_sampling_ratio/mean": 0.7124192714691162, + "sampling/importance_sampling_ratio/min": 0.3038400709629059, + "sampling/sampling_logp_difference/max": 0.3864710330963135, + "sampling/sampling_logp_difference/mean": 0.02677079290151596, + "step": 485, + "step_time": 51.5313537089969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.37189728021621704, + "epoch": 0.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.553479552268982, + "kl": 0.01384771429002285, + "learning_rate": 2.7425246321366205e-06, + "loss": -0.1355, + "num_tokens": 2702672.0, + "reward": -0.05000000074505806, + "reward_std": 0.03972514346241951, + "rewards/reward_func/mean": -0.05000000074505806, + "rewards/reward_func/std": 0.04105745255947113, + "sampling/importance_sampling_ratio/max": 2.5754904747009277, + "sampling/importance_sampling_ratio/mean": 1.1340928077697754, + "sampling/importance_sampling_ratio/min": 0.5375442504882812, + "sampling/sampling_logp_difference/max": 0.47546517848968506, + "sampling/sampling_logp_difference/mean": 0.02835531160235405, + "step": 486, + "step_time": 90.47422146701138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.38735634088516235, + "epoch": 0.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.343964695930481, + "kl": 0.02191627398133278, + "learning_rate": 2.7344646755704078e-06, + "loss": 0.1007, + "num_tokens": 2708209.0, + "reward": 0.0637499988079071, + "reward_std": 0.3018624186515808, + "rewards/reward_func/mean": 0.0637499988079071, + "rewards/reward_func/std": 0.3814610242843628, + "sampling/importance_sampling_ratio/max": 1.4456323385238647, + "sampling/importance_sampling_ratio/mean": 0.9162258505821228, + "sampling/importance_sampling_ratio/min": 0.4561156928539276, + "sampling/sampling_logp_difference/max": 0.5119402408599854, + "sampling/sampling_logp_difference/mean": 0.028758030384778976, + "step": 487, + "step_time": 92.49751431899494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3179323077201843, + "epoch": 0.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9897328615188599, + "kl": 0.04169601947069168, + "learning_rate": 2.726402259580695e-06, + "loss": 0.0601, + "num_tokens": 2713886.0, + "reward": 0.33500000834465027, + "reward_std": 0.2701554596424103, + "rewards/reward_func/mean": 0.33500000834465027, + "rewards/reward_func/std": 0.5461553931236267, + "sampling/importance_sampling_ratio/max": 1.5669019222259521, + "sampling/importance_sampling_ratio/mean": 0.9279680252075195, + "sampling/importance_sampling_ratio/min": 0.5139185786247253, + "sampling/sampling_logp_difference/max": 0.6310797929763794, + "sampling/sampling_logp_difference/mean": 0.023750916123390198, + "step": 488, + "step_time": 49.78819806300453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.33032259345054626, + "epoch": 0.978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6407569050788879, + "kl": 0.02176561951637268, + "learning_rate": 2.71833746873841e-06, + "loss": -0.0798, + "num_tokens": 2718931.0, + "reward": 0.44875001907348633, + "reward_std": 0.5220805406570435, + "rewards/reward_func/mean": 0.44875001907348633, + "rewards/reward_func/std": 0.5664535760879517, + "sampling/importance_sampling_ratio/max": 1.1846626996994019, + "sampling/importance_sampling_ratio/mean": 0.8170421123504639, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.2127480506896973, + "sampling/sampling_logp_difference/mean": 0.020459800958633423, + "step": 489, + "step_time": 71.78667046700139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3335261344909668, + "epoch": 0.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9051764011383057, + "kl": 0.027933314442634583, + "learning_rate": 2.7102703876393942e-06, + "loss": 0.03, + "num_tokens": 2723945.0, + "reward": 0.20000000298023224, + "reward_std": 0.5354849100112915, + "rewards/reward_func/mean": 0.20000000298023224, + "rewards/reward_func/std": 0.49638697504997253, + "sampling/importance_sampling_ratio/max": 1.6904243230819702, + "sampling/importance_sampling_ratio/mean": 0.8453304767608643, + "sampling/importance_sampling_ratio/min": 0.358101487159729, + "sampling/sampling_logp_difference/max": 0.6200103759765625, + "sampling/sampling_logp_difference/mean": 0.019122183322906494, + "step": 490, + "step_time": 73.55681845199433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.34773534536361694, + "epoch": 0.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5184320211410522, + "kl": 0.029565483331680298, + "learning_rate": 2.702201100903511e-06, + "loss": 0.2018, + "num_tokens": 2730051.0, + "reward": 0.36625000834465027, + "reward_std": 0.5476330518722534, + "rewards/reward_func/mean": 0.36625000834465027, + "rewards/reward_func/std": 0.5249200463294983, + "sampling/importance_sampling_ratio/max": 1.7158502340316772, + "sampling/importance_sampling_ratio/mean": 0.9216998815536499, + "sampling/importance_sampling_ratio/min": 0.45285990834236145, + "sampling/sampling_logp_difference/max": 0.6381690502166748, + "sampling/sampling_logp_difference/mean": 0.027182936668395996, + "step": 491, + "step_time": 56.38132729998324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.46129417419433594, + "epoch": 0.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9216625690460205, + "kl": 0.029165223240852356, + "learning_rate": 2.694129693173759e-06, + "loss": -0.0598, + "num_tokens": 2735276.0, + "reward": 0.4675000309944153, + "reward_std": 0.5070215463638306, + "rewards/reward_func/mean": 0.4675000309944153, + "rewards/reward_func/std": 0.5406543612480164, + "sampling/importance_sampling_ratio/max": 1.0167001485824585, + "sampling/importance_sampling_ratio/mean": 0.7158698439598083, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.7142941951751709, + "sampling/sampling_logp_difference/mean": 0.027627810835838318, + "step": 492, + "step_time": 76.63889957600622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3078922629356384, + "epoch": 0.986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0626574754714966, + "kl": 0.023861799389123917, + "learning_rate": 2.6860562491153854e-06, + "loss": -0.11, + "num_tokens": 2740801.0, + "reward": 0.2199999988079071, + "reward_std": 0.5209156274795532, + "rewards/reward_func/mean": 0.2199999988079071, + "rewards/reward_func/std": 0.4826415479183197, + "sampling/importance_sampling_ratio/max": 1.2552741765975952, + "sampling/importance_sampling_ratio/mean": 0.95084547996521, + "sampling/importance_sampling_ratio/min": 0.46218550205230713, + "sampling/sampling_logp_difference/max": 0.9027338027954102, + "sampling/sampling_logp_difference/mean": 0.023295089602470398, + "step": 493, + "step_time": 62.630216120014666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 46.5, + "completions/mean_terminated_length": 46.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.33969372510910034, + "epoch": 0.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0334094762802124, + "kl": 0.021102532744407654, + "learning_rate": 2.6779808534149986e-06, + "loss": 0.0949, + "num_tokens": 2746644.0, + "reward": 0.09999999403953552, + "reward_std": 0.2607312798500061, + "rewards/reward_func/mean": 0.09999999403953552, + "rewards/reward_func/std": 0.3642212748527527, + "sampling/importance_sampling_ratio/max": 1.5536633729934692, + "sampling/importance_sampling_ratio/mean": 0.9165699481964111, + "sampling/importance_sampling_ratio/min": 0.5814899802207947, + "sampling/sampling_logp_difference/max": 0.7713108062744141, + "sampling/sampling_logp_difference/mean": 0.022944262251257896, + "step": 494, + "step_time": 97.2119018859812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3434738516807556, + "epoch": 0.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3791193962097168, + "kl": 0.025687772780656815, + "learning_rate": 2.6699035907796796e-06, + "loss": 0.2039, + "num_tokens": 2752279.0, + "reward": 0.20499999821186066, + "reward_std": 0.5269919633865356, + "rewards/reward_func/mean": 0.20499999821186066, + "rewards/reward_func/std": 0.4880281090736389, + "sampling/importance_sampling_ratio/max": 1.5326544046401978, + "sampling/importance_sampling_ratio/mean": 1.1358022689819336, + "sampling/importance_sampling_ratio/min": 0.7314006090164185, + "sampling/sampling_logp_difference/max": 0.5475611686706543, + "sampling/sampling_logp_difference/mean": 0.031097054481506348, + "step": 495, + "step_time": 91.44340489199385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3790108859539032, + "epoch": 0.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1053639650344849, + "kl": 0.021480565890669823, + "learning_rate": 2.6618245459360896e-06, + "loss": -0.2536, + "num_tokens": 2757556.0, + "reward": 0.0937500074505806, + "reward_std": 0.27560853958129883, + "rewards/reward_func/mean": 0.0937500074505806, + "rewards/reward_func/std": 0.36769309639930725, + "sampling/importance_sampling_ratio/max": 1.4687750339508057, + "sampling/importance_sampling_ratio/mean": 0.9616619348526001, + "sampling/importance_sampling_ratio/min": 0.2960628867149353, + "sampling/sampling_logp_difference/max": 0.517666220664978, + "sampling/sampling_logp_difference/mean": 0.028408560901880264, + "step": 496, + "step_time": 87.52318387202104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.31511783599853516, + "epoch": 0.994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8182615637779236, + "kl": 0.014653578400611877, + "learning_rate": 2.6537438036295876e-06, + "loss": -0.0539, + "num_tokens": 2763537.0, + "reward": 0.45750001072883606, + "reward_std": 0.5164840221405029, + "rewards/reward_func/mean": 0.45750001072883606, + "rewards/reward_func/std": 0.5492787957191467, + "sampling/importance_sampling_ratio/max": 1.4413000345230103, + "sampling/importance_sampling_ratio/mean": 0.7662212252616882, + "sampling/importance_sampling_ratio/min": 0.33490437269210815, + "sampling/sampling_logp_difference/max": 0.8015744686126709, + "sampling/sampling_logp_difference/mean": 0.022109784185886383, + "step": 497, + "step_time": 83.04217743998743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.34117379784584045, + "epoch": 0.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.383134365081787, + "kl": 0.03160976245999336, + "learning_rate": 2.6456614486233344e-06, + "loss": 0.0937, + "num_tokens": 2768283.0, + "reward": 0.45125001668930054, + "reward_std": 0.611153244972229, + "rewards/reward_func/mean": 0.45125001668930054, + "rewards/reward_func/std": 0.5658984780311584, + "sampling/importance_sampling_ratio/max": 1.6628714799880981, + "sampling/importance_sampling_ratio/mean": 1.1422840356826782, + "sampling/importance_sampling_ratio/min": 0.6167079210281372, + "sampling/sampling_logp_difference/max": 0.4831216335296631, + "sampling/sampling_logp_difference/mean": 0.025718016549944878, + "step": 498, + "step_time": 52.98551483498886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 45.75, + "completions/mean_terminated_length": 45.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.40990570187568665, + "epoch": 0.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3361786603927612, + "kl": 0.02863333187997341, + "learning_rate": 2.6375775656974124e-06, + "loss": 0.1209, + "num_tokens": 2773418.0, + "reward": 0.33250001072883606, + "reward_std": 0.5637004375457764, + "rewards/reward_func/mean": 0.33250001072883606, + "rewards/reward_func/std": 0.545494556427002, + "sampling/importance_sampling_ratio/max": 1.6512422561645508, + "sampling/importance_sampling_ratio/mean": 1.0369747877120972, + "sampling/importance_sampling_ratio/min": 0.7347527146339417, + "sampling/sampling_logp_difference/max": 0.4192899465560913, + "sampling/sampling_logp_difference/mean": 0.0260856244713068, + "step": 499, + "step_time": 81.9041408339981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 44.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.36693620681762695, + "epoch": 1.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0463021993637085, + "kl": 0.029872596263885498, + "learning_rate": 2.6294922396479263e-06, + "loss": -0.2292, + "num_tokens": 2778968.0, + "reward": 0.20874999463558197, + "reward_std": 0.3164796531200409, + "rewards/reward_func/mean": 0.20874999463558197, + "rewards/reward_func/std": 0.4829207956790924, + "sampling/importance_sampling_ratio/max": 1.7435824871063232, + "sampling/importance_sampling_ratio/mean": 0.9001740217208862, + "sampling/importance_sampling_ratio/min": 0.30285191535949707, + "sampling/sampling_logp_difference/max": 0.6381608247756958, + "sampling/sampling_logp_difference/mean": 0.031203145161271095, + "step": 500, + "step_time": 110.34948237799108 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 2778968, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}