{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.35832011699676514, "epoch": 0.002, "frac_reward_zero_std": 0.0, "grad_norm": 1.23671293258667, "kl": 0.0, "learning_rate": 0.0, "loss": 0.2758, "num_tokens": 5417.0, "reward": 0.4775000214576721, "reward_std": 0.5056283473968506, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5403900742530823, "sampling/importance_sampling_ratio/max": 2.4071154594421387, "sampling/importance_sampling_ratio/mean": 1.1429595947265625, "sampling/importance_sampling_ratio/min": 0.5015585422515869, "sampling/sampling_logp_difference/max": 0.5305562019348145, "sampling/sampling_logp_difference/mean": 0.024324804544448853, "step": 1, "step_time": 29.307177749986295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3473261594772339, "epoch": 0.004, "frac_reward_zero_std": 0.0, "grad_norm": 3.3394556045532227, "kl": 0.0, "learning_rate": 1.6666666666666668e-07, "loss": 0.2918, "num_tokens": 11253.0, "reward": 0.581250011920929, "reward_std": 0.5712425708770752, "rewards/reward_func/mean": 0.581250011920929, "rewards/reward_func/std": 0.5513473749160767, "sampling/importance_sampling_ratio/max": 2.3380353450775146, "sampling/importance_sampling_ratio/mean": 1.2109484672546387, "sampling/importance_sampling_ratio/min": 0.4137703776359558, "sampling/sampling_logp_difference/max": 0.6683757305145264, "sampling/sampling_logp_difference/mean": 0.024658963084220886, "step": 2, "step_time": 40.91707400101586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.36352208256721497, "epoch": 0.006, "frac_reward_zero_std": 0.0, "grad_norm": 1.7528427839279175, "kl": 0.0018581235781311989, "learning_rate": 3.3333333333333335e-07, "loss": 0.0856, "num_tokens": 16645.0, "reward": 0.22500000894069672, "reward_std": 0.3063344955444336, "rewards/reward_func/mean": 0.22500000894069672, "rewards/reward_func/std": 0.4666905105113983, "sampling/importance_sampling_ratio/max": 1.6700822114944458, "sampling/importance_sampling_ratio/mean": 1.325523018836975, "sampling/importance_sampling_ratio/min": 0.6139910221099854, "sampling/sampling_logp_difference/max": 0.3466939926147461, "sampling/sampling_logp_difference/mean": 0.0239357128739357, "step": 3, "step_time": 33.77775888898759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.335945725440979, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 1.072298526763916, "kl": 0.0018037607660517097, "learning_rate": 5.000000000000001e-07, "loss": 0.1746, "num_tokens": 22430.0, "reward": 0.21375000476837158, "reward_std": 0.5115964412689209, "rewards/reward_func/mean": 0.21375000476837158, "rewards/reward_func/std": 0.47388777136802673, "sampling/importance_sampling_ratio/max": 2.130910873413086, "sampling/importance_sampling_ratio/mean": 0.9638596773147583, "sampling/importance_sampling_ratio/min": 0.3092893362045288, "sampling/sampling_logp_difference/max": 0.9354848861694336, "sampling/sampling_logp_difference/mean": 0.022302545607089996, "step": 4, "step_time": 34.65735469799256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 45.625, "completions/mean_terminated_length": 45.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3547826111316681, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 1.7712143659591675, "kl": 0.0015525126364082098, "learning_rate": 6.666666666666667e-07, "loss": 0.1898, "num_tokens": 28484.0, "reward": 0.20000000298023224, "reward_std": 0.49363037943840027, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.4572901427745819, "sampling/importance_sampling_ratio/max": 2.8111071586608887, "sampling/importance_sampling_ratio/mean": 1.2563235759735107, "sampling/importance_sampling_ratio/min": 0.7284324169158936, "sampling/sampling_logp_difference/max": 0.39002323150634766, "sampling/sampling_logp_difference/mean": 0.02487805485725403, "step": 5, "step_time": 39.42609074199572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.4256824254989624, "epoch": 0.012, "frac_reward_zero_std": 0.0, "grad_norm": 1.2577799558639526, "kl": 0.002119219396263361, "learning_rate": 8.333333333333333e-07, "loss": -0.1565, "num_tokens": 33246.0, "reward": 0.48250001668930054, "reward_std": 0.5949929356575012, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5508629679679871, "sampling/importance_sampling_ratio/max": 1.764662742614746, "sampling/importance_sampling_ratio/mean": 1.1115164756774902, "sampling/importance_sampling_ratio/min": 0.4326048195362091, "sampling/sampling_logp_difference/max": 0.35713624954223633, "sampling/sampling_logp_difference/mean": 0.023226505145430565, "step": 6, "step_time": 25.11293228599243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3634033203125, "epoch": 0.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.7828105688095093, "kl": 0.0015323495026677847, "learning_rate": 1.0000000000000002e-06, "loss": 0.0796, "num_tokens": 39042.0, "reward": 0.3125, "reward_std": 0.5887748003005981, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.5680983662605286, "sampling/importance_sampling_ratio/max": 1.3392544984817505, "sampling/importance_sampling_ratio/mean": 0.7953487634658813, "sampling/importance_sampling_ratio/min": 0.4173814654350281, "sampling/sampling_logp_difference/max": 0.29545068740844727, "sampling/sampling_logp_difference/mean": 0.025280017405748367, "step": 7, "step_time": 47.24140843501664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 43.375, "completions/mean_terminated_length": 43.375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3641508221626282, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 1.1968954801559448, "kl": 0.0013946478720754385, "learning_rate": 1.1666666666666668e-06, "loss": -0.1307, "num_tokens": 44922.0, "reward": 0.22875000536441803, "reward_std": 0.2921527922153473, "rewards/reward_func/mean": 0.22875000536441803, "rewards/reward_func/std": 0.4607583284378052, "sampling/importance_sampling_ratio/max": 1.5681222677230835, "sampling/importance_sampling_ratio/mean": 1.014966368675232, "sampling/importance_sampling_ratio/min": 0.7567934393882751, "sampling/sampling_logp_difference/max": 0.34651947021484375, "sampling/sampling_logp_difference/mean": 0.01997371017932892, "step": 8, "step_time": 34.33773313398706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.625, "completions/mean_terminated_length": 55.625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.35281139612197876, "epoch": 0.018, "frac_reward_zero_std": 0.0, "grad_norm": 0.8254293203353882, "kl": 0.002236333442851901, "learning_rate": 1.3333333333333334e-06, "loss": 0.0283, "num_tokens": 50617.0, "reward": 0.46875, "reward_std": 0.5300248861312866, "rewards/reward_func/mean": 0.46875, "rewards/reward_func/std": 0.5659489631652832, "sampling/importance_sampling_ratio/max": 1.2048767805099487, "sampling/importance_sampling_ratio/mean": 0.7666900157928467, "sampling/importance_sampling_ratio/min": 0.39571237564086914, "sampling/sampling_logp_difference/max": 0.35016971826553345, "sampling/sampling_logp_difference/mean": 0.025727007538080215, "step": 9, "step_time": 34.18884765400435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 50.875, "completions/mean_terminated_length": 50.875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3982703983783722, "epoch": 0.02, "frac_reward_zero_std": 0.0, "grad_norm": 1.185789942741394, "kl": 0.001458184327930212, "learning_rate": 1.5e-06, "loss": 0.0761, "num_tokens": 56268.0, "reward": 0.07750000059604645, "reward_std": 0.28465136885643005, "rewards/reward_func/mean": 0.07750000059604645, "rewards/reward_func/std": 0.3708580732345581, "sampling/importance_sampling_ratio/max": 2.0030765533447266, "sampling/importance_sampling_ratio/mean": 0.9082742929458618, "sampling/importance_sampling_ratio/min": 0.42338261008262634, "sampling/sampling_logp_difference/max": 0.4783933162689209, "sampling/sampling_logp_difference/mean": 0.023844268172979355, "step": 10, "step_time": 39.159437736991094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.875, "completions/mean_terminated_length": 54.875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3613673448562622, "epoch": 0.022, "frac_reward_zero_std": 0.0, "grad_norm": 0.8812184929847717, "kl": 0.0014497374650090933, "learning_rate": 1.6666666666666667e-06, "loss": -0.0696, "num_tokens": 62534.0, "reward": 0.3400000035762787, "reward_std": 0.27956950664520264, "rewards/reward_func/mean": 0.3400000035762787, "rewards/reward_func/std": 0.543007493019104, "sampling/importance_sampling_ratio/max": 1.5621131658554077, "sampling/importance_sampling_ratio/mean": 0.8559645414352417, "sampling/importance_sampling_ratio/min": 0.45671403408050537, "sampling/sampling_logp_difference/max": 0.3955717086791992, "sampling/sampling_logp_difference/mean": 0.02080589532852173, "step": 11, "step_time": 39.89227997799753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.39176082611083984, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 1.1600431203842163, "kl": 0.0020833718590438366, "learning_rate": 1.8333333333333333e-06, "loss": -0.0961, "num_tokens": 68151.0, "reward": 0.32374998927116394, "reward_std": 0.5406870245933533, "rewards/reward_func/mean": 0.32374998927116394, "rewards/reward_func/std": 0.5189808011054993, "sampling/importance_sampling_ratio/max": 2.046029806137085, "sampling/importance_sampling_ratio/mean": 1.0404480695724487, "sampling/importance_sampling_ratio/min": 0.48177048563957214, "sampling/sampling_logp_difference/max": 0.2973281145095825, "sampling/sampling_logp_difference/mean": 0.024639006704092026, "step": 12, "step_time": 48.536910057999194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 43.375, "completions/mean_terminated_length": 43.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3269670605659485, "epoch": 0.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.8620632886886597, "kl": 0.001273418078199029, "learning_rate": 2.0000000000000003e-06, "loss": -0.081, "num_tokens": 73963.0, "reward": 0.3425000011920929, "reward_std": 0.5563790202140808, "rewards/reward_func/mean": 0.3425000011920929, "rewards/reward_func/std": 0.5344623923301697, "sampling/importance_sampling_ratio/max": 1.4767922163009644, "sampling/importance_sampling_ratio/mean": 0.8396698236465454, "sampling/importance_sampling_ratio/min": 0.5644444823265076, "sampling/sampling_logp_difference/max": 0.2883424758911133, "sampling/sampling_logp_difference/mean": 0.024868279695510864, "step": 13, "step_time": 33.96547721300158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.31234925985336304, "epoch": 0.028, "frac_reward_zero_std": 0.0, "grad_norm": 1.1137773990631104, "kl": 0.0012005593162029982, "learning_rate": 2.166666666666667e-06, "loss": 0.3334, "num_tokens": 78838.0, "reward": 0.29625001549720764, "reward_std": 0.6014425754547119, "rewards/reward_func/mean": 0.29625001549720764, "rewards/reward_func/std": 0.5761184692382812, "sampling/importance_sampling_ratio/max": 2.09089994430542, "sampling/importance_sampling_ratio/mean": 1.2477295398712158, "sampling/importance_sampling_ratio/min": 0.702942430973053, "sampling/sampling_logp_difference/max": 0.46815013885498047, "sampling/sampling_logp_difference/mean": 0.019913293421268463, "step": 14, "step_time": 33.421214936010074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 50.625, "completions/mean_terminated_length": 50.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3603006601333618, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 1.8420554399490356, "kl": 0.001688068499788642, "learning_rate": 2.3333333333333336e-06, "loss": 0.3399, "num_tokens": 84243.0, "reward": 0.07874999940395355, "reward_std": 0.2735734283924103, "rewards/reward_func/mean": 0.07874999940395355, "rewards/reward_func/std": 0.3578681945800781, "sampling/importance_sampling_ratio/max": 2.986236095428467, "sampling/importance_sampling_ratio/mean": 1.2305893898010254, "sampling/importance_sampling_ratio/min": 0.7438207864761353, "sampling/sampling_logp_difference/max": 0.5467426776885986, "sampling/sampling_logp_difference/mean": 0.024384144693613052, "step": 15, "step_time": 35.22606979601551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3401448726654053, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 1.2467572689056396, "kl": 0.0013698764378204942, "learning_rate": 2.5e-06, "loss": 0.0465, "num_tokens": 89603.0, "reward": 0.1887499988079071, "reward_std": 0.33193475008010864, "rewards/reward_func/mean": 0.1887499988079071, "rewards/reward_func/std": 0.48774808645248413, "sampling/importance_sampling_ratio/max": 1.0488877296447754, "sampling/importance_sampling_ratio/mean": 0.8098611831665039, "sampling/importance_sampling_ratio/min": 0.5529040694236755, "sampling/sampling_logp_difference/max": 0.4784054756164551, "sampling/sampling_logp_difference/mean": 0.021436292678117752, "step": 16, "step_time": 40.98757715098327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 46.875, "completions/mean_terminated_length": 46.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3725942373275757, "epoch": 0.034, "frac_reward_zero_std": 0.0, "grad_norm": 1.417001724243164, "kl": 0.0016124111134558916, "learning_rate": 2.666666666666667e-06, "loss": 0.0637, "num_tokens": 94945.0, "reward": 0.05000000447034836, "reward_std": 0.2862437069416046, "rewards/reward_func/mean": 0.05000000447034836, "rewards/reward_func/std": 0.38652294874191284, "sampling/importance_sampling_ratio/max": 1.8293613195419312, "sampling/importance_sampling_ratio/mean": 1.3590400218963623, "sampling/importance_sampling_ratio/min": 0.8256513476371765, "sampling/sampling_logp_difference/max": 0.3571474552154541, "sampling/sampling_logp_difference/mean": 0.020312845706939697, "step": 17, "step_time": 37.4742742870003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3666185140609741, "epoch": 0.036, "frac_reward_zero_std": 0.0, "grad_norm": 1.168238639831543, "kl": 0.001655534841120243, "learning_rate": 2.8333333333333335e-06, "loss": -0.1929, "num_tokens": 100888.0, "reward": 0.20124998688697815, "reward_std": 0.5236045122146606, "rewards/reward_func/mean": 0.20124998688697815, "rewards/reward_func/std": 0.48489874601364136, "sampling/importance_sampling_ratio/max": 1.673153281211853, "sampling/importance_sampling_ratio/mean": 1.0230400562286377, "sampling/importance_sampling_ratio/min": 0.5740097165107727, "sampling/sampling_logp_difference/max": 0.27298808097839355, "sampling/sampling_logp_difference/mean": 0.02411050722002983, "step": 18, "step_time": 36.143502769002225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.37803328037261963, "epoch": 0.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.9871008396148682, "kl": 0.002036505378782749, "learning_rate": 3e-06, "loss": -0.0558, "num_tokens": 106144.0, "reward": 0.35625001788139343, "reward_std": 0.5298318266868591, "rewards/reward_func/mean": 0.35625001788139343, "rewards/reward_func/std": 0.5088625550270081, "sampling/importance_sampling_ratio/max": 1.4062168598175049, "sampling/importance_sampling_ratio/mean": 0.9718549251556396, "sampling/importance_sampling_ratio/min": 0.3938085734844208, "sampling/sampling_logp_difference/max": 0.3405449390411377, "sampling/sampling_logp_difference/mean": 0.02122277393937111, "step": 19, "step_time": 39.01482636100263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.4097254276275635, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 1.3784204721450806, "kl": 0.0016567106358706951, "learning_rate": 3.1666666666666667e-06, "loss": 0.0886, "num_tokens": 112107.0, "reward": -0.057499997317790985, "reward_std": 0.044269345700740814, "rewards/reward_func/mean": -0.057499997317790985, "rewards/reward_func/std": 0.04166190326213837, "sampling/importance_sampling_ratio/max": 2.0899689197540283, "sampling/importance_sampling_ratio/mean": 1.132345199584961, "sampling/importance_sampling_ratio/min": 0.4411206543445587, "sampling/sampling_logp_difference/max": 0.5205492973327637, "sampling/sampling_logp_difference/mean": 0.027103282511234283, "step": 20, "step_time": 45.0223436219967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.36454537510871887, "epoch": 0.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.7992667555809021, "kl": 0.0020659861620515585, "learning_rate": 3.3333333333333333e-06, "loss": 0.1669, "num_tokens": 117999.0, "reward": 0.32624998688697815, "reward_std": 0.5471616387367249, "rewards/reward_func/mean": 0.32624998688697815, "rewards/reward_func/std": 0.5324857831001282, "sampling/importance_sampling_ratio/max": 1.8343143463134766, "sampling/importance_sampling_ratio/mean": 0.8793189525604248, "sampling/importance_sampling_ratio/min": 0.3384288549423218, "sampling/sampling_logp_difference/max": 0.4840106964111328, "sampling/sampling_logp_difference/mean": 0.02400803565979004, "step": 21, "step_time": 40.175860888994066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.40864837169647217, "epoch": 0.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.843110978603363, "kl": 0.001421776250936091, "learning_rate": 3.5e-06, "loss": -0.0948, "num_tokens": 123708.0, "reward": 0.4725000262260437, "reward_std": 0.5133668184280396, "rewards/reward_func/mean": 0.4725000262260437, "rewards/reward_func/std": 0.549278736114502, "sampling/importance_sampling_ratio/max": 1.4361987113952637, "sampling/importance_sampling_ratio/mean": 0.8868111371994019, "sampling/importance_sampling_ratio/min": 0.42872440814971924, "sampling/sampling_logp_difference/max": 0.33927369117736816, "sampling/sampling_logp_difference/mean": 0.02582230418920517, "step": 22, "step_time": 168.2754703540122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.32568132877349854, "epoch": 0.046, "frac_reward_zero_std": 0.0, "grad_norm": 1.166347861289978, "kl": 0.0016544836107641459, "learning_rate": 3.6666666666666666e-06, "loss": -0.0458, "num_tokens": 129149.0, "reward": 0.19249999523162842, "reward_std": 0.5302917957305908, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.4909684658050537, "sampling/importance_sampling_ratio/max": 1.6518144607543945, "sampling/importance_sampling_ratio/mean": 0.894943118095398, "sampling/importance_sampling_ratio/min": 0.5825864672660828, "sampling/sampling_logp_difference/max": 0.48093175888061523, "sampling/sampling_logp_difference/mean": 0.02260264754295349, "step": 23, "step_time": 125.51062903201091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3474411070346832, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 1.0562515258789062, "kl": 0.001693481463007629, "learning_rate": 3.833333333333334e-06, "loss": 0.2041, "num_tokens": 134846.0, "reward": 0.08125000447034836, "reward_std": 0.2956419289112091, "rewards/reward_func/mean": 0.08125000447034836, "rewards/reward_func/std": 0.3755924105644226, "sampling/importance_sampling_ratio/max": 2.1531643867492676, "sampling/importance_sampling_ratio/mean": 1.043798565864563, "sampling/importance_sampling_ratio/min": 0.529705822467804, "sampling/sampling_logp_difference/max": 0.34720849990844727, "sampling/sampling_logp_difference/mean": 0.01930052787065506, "step": 24, "step_time": 167.04036519900546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.3871381878852844, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 1.3111368417739868, "kl": 0.0014404752291738987, "learning_rate": 4.000000000000001e-06, "loss": -0.0471, "num_tokens": 140378.0, "reward": 0.36124998331069946, "reward_std": 0.5499054193496704, "rewards/reward_func/mean": 0.36124998331069946, "rewards/reward_func/std": 0.5263469219207764, "sampling/importance_sampling_ratio/max": 1.4156557321548462, "sampling/importance_sampling_ratio/mean": 1.1120198965072632, "sampling/importance_sampling_ratio/min": 0.7486764788627625, "sampling/sampling_logp_difference/max": 0.48737621307373047, "sampling/sampling_logp_difference/mean": 0.023105096071958542, "step": 25, "step_time": 120.95955076700193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.39792579412460327, "epoch": 0.052, "frac_reward_zero_std": 0.0, "grad_norm": 1.396022915840149, "kl": 0.0015975476708263159, "learning_rate": 4.166666666666667e-06, "loss": -0.0444, "num_tokens": 146478.0, "reward": 0.2224999964237213, "reward_std": 0.31392672657966614, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.4807955026626587, "sampling/importance_sampling_ratio/max": 1.508078932762146, "sampling/importance_sampling_ratio/mean": 1.0499423742294312, "sampling/importance_sampling_ratio/min": 0.5942177772521973, "sampling/sampling_logp_difference/max": 0.3570747375488281, "sampling/sampling_logp_difference/mean": 0.024486597627401352, "step": 26, "step_time": 104.28418873299961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3486085534095764, "epoch": 0.054, "frac_reward_zero_std": 0.0, "grad_norm": 1.1279908418655396, "kl": 0.0019664729479700327, "learning_rate": 4.333333333333334e-06, "loss": -0.1021, "num_tokens": 151586.0, "reward": 0.3425000011920929, "reward_std": 0.2686923146247864, "rewards/reward_func/mean": 0.3425000011920929, "rewards/reward_func/std": 0.5363301634788513, "sampling/importance_sampling_ratio/max": 1.8593271970748901, "sampling/importance_sampling_ratio/mean": 1.1785297393798828, "sampling/importance_sampling_ratio/min": 0.5566311478614807, "sampling/sampling_logp_difference/max": 0.4686328172683716, "sampling/sampling_logp_difference/mean": 0.022173412144184113, "step": 27, "step_time": 85.11922752400278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.37405920028686523, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.8106129765510559, "kl": 0.0015516983112320304, "learning_rate": 4.5e-06, "loss": 0.0532, "num_tokens": 157108.0, "reward": 0.32375001907348633, "reward_std": 0.5761679410934448, "rewards/reward_func/mean": 0.32375001907348633, "rewards/reward_func/std": 0.5525767803192139, "sampling/importance_sampling_ratio/max": 1.2767354249954224, "sampling/importance_sampling_ratio/mean": 0.8917201161384583, "sampling/importance_sampling_ratio/min": 0.5755601525306702, "sampling/sampling_logp_difference/max": 0.4100228548049927, "sampling/sampling_logp_difference/mean": 0.021200813353061676, "step": 28, "step_time": 109.97910062700976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3767409026622772, "epoch": 0.058, "frac_reward_zero_std": 0.0, "grad_norm": 1.2156670093536377, "kl": 0.0015971511602401733, "learning_rate": 4.666666666666667e-06, "loss": 0.0163, "num_tokens": 163323.0, "reward": 0.08124999701976776, "reward_std": 0.2750605642795563, "rewards/reward_func/mean": 0.08124999701976776, "rewards/reward_func/std": 0.35746878385543823, "sampling/importance_sampling_ratio/max": 1.3450591564178467, "sampling/importance_sampling_ratio/mean": 1.031332015991211, "sampling/importance_sampling_ratio/min": 0.5739972591400146, "sampling/sampling_logp_difference/max": 0.20969057083129883, "sampling/sampling_logp_difference/mean": 0.018845085054636, "step": 29, "step_time": 125.90746463602409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3361830711364746, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 2.085590362548828, "kl": 0.002051064744591713, "learning_rate": 4.833333333333333e-06, "loss": -0.1044, "num_tokens": 169007.0, "reward": 0.11124999821186066, "reward_std": 0.2608071267604828, "rewards/reward_func/mean": 0.11124999821186066, "rewards/reward_func/std": 0.3598586320877075, "sampling/importance_sampling_ratio/max": 1.6862311363220215, "sampling/importance_sampling_ratio/mean": 0.9613958597183228, "sampling/importance_sampling_ratio/min": 0.4625941514968872, "sampling/sampling_logp_difference/max": 0.6341955661773682, "sampling/sampling_logp_difference/mean": 0.023160353302955627, "step": 30, "step_time": 116.55447733099572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.34975507855415344, "epoch": 0.062, "frac_reward_zero_std": 0.0, "grad_norm": 0.7752349376678467, "kl": 0.0018625394441187382, "learning_rate": 5e-06, "loss": -0.0863, "num_tokens": 175255.0, "reward": 0.05000000447034836, "reward_std": 0.28021717071533203, "rewards/reward_func/mean": 0.05000000447034836, "rewards/reward_func/std": 0.3846333920955658, "sampling/importance_sampling_ratio/max": 1.1800814867019653, "sampling/importance_sampling_ratio/mean": 0.7340657711029053, "sampling/importance_sampling_ratio/min": 0.3828251361846924, "sampling/sampling_logp_difference/max": 0.574752688407898, "sampling/sampling_logp_difference/mean": 0.022265031933784485, "step": 31, "step_time": 125.58116400899598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.33510199189186096, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 1.3970941305160522, "kl": 0.002852437552064657, "learning_rate": 4.99998688809149e-06, "loss": 0.0298, "num_tokens": 180203.0, "reward": 0.21125000715255737, "reward_std": 0.5205552577972412, "rewards/reward_func/mean": 0.21125000715255737, "rewards/reward_func/std": 0.48203253746032715, "sampling/importance_sampling_ratio/max": 1.4200493097305298, "sampling/importance_sampling_ratio/mean": 0.8962746858596802, "sampling/importance_sampling_ratio/min": 0.506354570388794, "sampling/sampling_logp_difference/max": 0.3653395175933838, "sampling/sampling_logp_difference/mean": 0.018153443932533264, "step": 32, "step_time": 99.66716593201272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.359342485666275, "epoch": 0.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.8919492363929749, "kl": 0.0016850470565259457, "learning_rate": 4.9999475525034974e-06, "loss": -0.0118, "num_tokens": 185921.0, "reward": 0.1887499988079071, "reward_std": 0.5218685865402222, "rewards/reward_func/mean": 0.1887499988079071, "rewards/reward_func/std": 0.4834529757499695, "sampling/importance_sampling_ratio/max": 1.193926453590393, "sampling/importance_sampling_ratio/mean": 0.886371374130249, "sampling/importance_sampling_ratio/min": 0.6291231513023376, "sampling/sampling_logp_difference/max": 0.6103978157043457, "sampling/sampling_logp_difference/mean": 0.022488413378596306, "step": 33, "step_time": 107.37048736499855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3850080370903015, "epoch": 0.068, "frac_reward_zero_std": 0.0, "grad_norm": 2.457120180130005, "kl": 0.002752592321485281, "learning_rate": 4.999881993648633e-06, "loss": -0.1605, "num_tokens": 191340.0, "reward": 0.33250001072883606, "reward_std": 0.5519619584083557, "rewards/reward_func/mean": 0.33250001072883606, "rewards/reward_func/std": 0.5346227288246155, "sampling/importance_sampling_ratio/max": 2.824227809906006, "sampling/importance_sampling_ratio/mean": 1.2650679349899292, "sampling/importance_sampling_ratio/min": 0.5782744288444519, "sampling/sampling_logp_difference/max": 0.5304313898086548, "sampling/sampling_logp_difference/mean": 0.026629671454429626, "step": 34, "step_time": 124.5775852559891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3793644905090332, "epoch": 0.07, "frac_reward_zero_std": 0.0, "grad_norm": 1.1844000816345215, "kl": 0.0018589177634567022, "learning_rate": 4.99979021221458e-06, "loss": 0.1138, "num_tokens": 197242.0, "reward": 0.20374999940395355, "reward_std": 0.3106112480163574, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.48269888758659363, "sampling/importance_sampling_ratio/max": 1.8714232444763184, "sampling/importance_sampling_ratio/mean": 0.8821603059768677, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7558255195617676, "sampling/sampling_logp_difference/mean": 0.02827462926506996, "step": 35, "step_time": 124.59522388697951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3386607766151428, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 1.0467987060546875, "kl": 0.002486670855432749, "learning_rate": 4.9996722091640805e-06, "loss": -0.0844, "num_tokens": 202103.0, "reward": 0.7137500047683716, "reward_std": 0.31673291325569153, "rewards/reward_func/mean": 0.7137500047683716, "rewards/reward_func/std": 0.4965578019618988, "sampling/importance_sampling_ratio/max": 1.1834334135055542, "sampling/importance_sampling_ratio/mean": 0.8062876462936401, "sampling/importance_sampling_ratio/min": 0.3481108844280243, "sampling/sampling_logp_difference/max": 0.5823209285736084, "sampling/sampling_logp_difference/mean": 0.027472082525491714, "step": 36, "step_time": 95.7645532739989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.35707682371139526, "epoch": 0.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.8499080538749695, "kl": 0.00230376492254436, "learning_rate": 4.999527985734932e-06, "loss": 0.0658, "num_tokens": 207849.0, "reward": 0.3112500011920929, "reward_std": 0.5869807004928589, "rewards/reward_func/mean": 0.3112500011920929, "rewards/reward_func/std": 0.5547313094139099, "sampling/importance_sampling_ratio/max": 1.3937541246414185, "sampling/importance_sampling_ratio/mean": 0.9204949140548706, "sampling/importance_sampling_ratio/min": 0.5516513586044312, "sampling/sampling_logp_difference/max": 0.340686559677124, "sampling/sampling_logp_difference/mean": 0.02302435413002968, "step": 37, "step_time": 84.15662719498505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3510167896747589, "epoch": 0.076, "frac_reward_zero_std": 0.0, "grad_norm": 1.4321602582931519, "kl": 0.002203675452619791, "learning_rate": 4.999357543439969e-06, "loss": -0.251, "num_tokens": 213602.0, "reward": 0.3187499940395355, "reward_std": 0.5740761756896973, "rewards/reward_func/mean": 0.3187499940395355, "rewards/reward_func/std": 0.5478904247283936, "sampling/importance_sampling_ratio/max": 1.6841275691986084, "sampling/importance_sampling_ratio/mean": 0.862945556640625, "sampling/importance_sampling_ratio/min": 0.3341965675354004, "sampling/sampling_logp_difference/max": 0.4191019535064697, "sampling/sampling_logp_difference/mean": 0.023331163451075554, "step": 38, "step_time": 96.54629640298663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3528852164745331, "epoch": 0.078, "frac_reward_zero_std": 0.0, "grad_norm": 1.0844610929489136, "kl": 0.0035500035155564547, "learning_rate": 4.999160884067051e-06, "loss": 0.0473, "num_tokens": 219224.0, "reward": 0.08124999701976776, "reward_std": 0.27454516291618347, "rewards/reward_func/mean": 0.08124999701976776, "rewards/reward_func/std": 0.3577883243560791, "sampling/importance_sampling_ratio/max": 1.6339404582977295, "sampling/importance_sampling_ratio/mean": 0.916239857673645, "sampling/importance_sampling_ratio/min": 0.5048863291740417, "sampling/sampling_logp_difference/max": 0.4355291724205017, "sampling/sampling_logp_difference/mean": 0.02792040817439556, "step": 39, "step_time": 90.11714809801197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.35738155245780945, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.9739111065864563, "kl": 0.010244293138384819, "learning_rate": 4.9989380096790416e-06, "loss": 0.0651, "num_tokens": 225224.0, "reward": 0.057500001043081284, "reward_std": 0.262703001499176, "rewards/reward_func/mean": 0.057500001043081284, "rewards/reward_func/std": 0.32779568433761597, "sampling/importance_sampling_ratio/max": 0.9581937193870544, "sampling/importance_sampling_ratio/mean": 0.7411354184150696, "sampling/importance_sampling_ratio/min": 0.6077343821525574, "sampling/sampling_logp_difference/max": 0.4662892818450928, "sampling/sampling_logp_difference/mean": 0.027016079053282738, "step": 40, "step_time": 115.12962955801049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.358026921749115, "epoch": 0.082, "frac_reward_zero_std": 0.0, "grad_norm": 0.9497724175453186, "kl": 0.0010761432349681854, "learning_rate": 4.998688922613788e-06, "loss": -0.1294, "num_tokens": 230706.0, "reward": 0.09750000387430191, "reward_std": 0.268343985080719, "rewards/reward_func/mean": 0.09750000387430191, "rewards/reward_func/std": 0.36311155557632446, "sampling/importance_sampling_ratio/max": 1.3778512477874756, "sampling/importance_sampling_ratio/mean": 0.8394644260406494, "sampling/importance_sampling_ratio/min": 0.5254734754562378, "sampling/sampling_logp_difference/max": 0.34768080711364746, "sampling/sampling_logp_difference/mean": 0.02200084924697876, "step": 41, "step_time": 105.99199089498143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3547409176826477, "epoch": 0.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.8008268475532532, "kl": 0.002836492843925953, "learning_rate": 4.998413625484095e-06, "loss": 0.0157, "num_tokens": 235797.0, "reward": 0.1837500035762787, "reward_std": 0.49929410219192505, "rewards/reward_func/mean": 0.1837500035762787, "rewards/reward_func/std": 0.46315494179725647, "sampling/importance_sampling_ratio/max": 1.5412955284118652, "sampling/importance_sampling_ratio/mean": 0.9024899005889893, "sampling/importance_sampling_ratio/min": 0.4405742287635803, "sampling/sampling_logp_difference/max": 0.32985711097717285, "sampling/sampling_logp_difference/mean": 0.022187065333127975, "step": 42, "step_time": 94.88906268199207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.34511110186576843, "epoch": 0.086, "frac_reward_zero_std": 0.0, "grad_norm": 2.284348964691162, "kl": 0.006113796960562468, "learning_rate": 4.9981121211777e-06, "loss": 0.2878, "num_tokens": 242012.0, "reward": 0.33125001192092896, "reward_std": 0.27656540274620056, "rewards/reward_func/mean": 0.33125001192092896, "rewards/reward_func/std": 0.5243346095085144, "sampling/importance_sampling_ratio/max": 2.3226945400238037, "sampling/importance_sampling_ratio/mean": 0.8612687587738037, "sampling/importance_sampling_ratio/min": 0.3401707410812378, "sampling/sampling_logp_difference/max": 0.6737399101257324, "sampling/sampling_logp_difference/mean": 0.02680300548672676, "step": 43, "step_time": 83.10613166898838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.35635292530059814, "epoch": 0.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.8651462197303772, "kl": 0.0018102331086993217, "learning_rate": 4.997784412857239e-06, "loss": 0.1933, "num_tokens": 248290.0, "reward": 0.08749999105930328, "reward_std": 0.26158273220062256, "rewards/reward_func/mean": 0.08749999105930328, "rewards/reward_func/std": 0.3502958118915558, "sampling/importance_sampling_ratio/max": 1.2505582571029663, "sampling/importance_sampling_ratio/mean": 0.8515357971191406, "sampling/importance_sampling_ratio/min": 0.3733709156513214, "sampling/sampling_logp_difference/max": 0.3616971969604492, "sampling/sampling_logp_difference/mean": 0.023276425898075104, "step": 44, "step_time": 96.6780000999861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 57.875, "completions/mean_terminated_length": 57.875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.4258785843849182, "epoch": 0.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.7754176259040833, "kl": 0.005309364292770624, "learning_rate": 4.99743050396022e-06, "loss": 0.0623, "num_tokens": 253843.0, "reward": 0.3537500202655792, "reward_std": 0.5509142875671387, "rewards/reward_func/mean": 0.3537500202655792, "rewards/reward_func/std": 0.5298500657081604, "sampling/importance_sampling_ratio/max": 1.3845423460006714, "sampling/importance_sampling_ratio/mean": 0.8699455261230469, "sampling/importance_sampling_ratio/min": 0.30967551469802856, "sampling/sampling_logp_difference/max": 0.4125208854675293, "sampling/sampling_logp_difference/mean": 0.02934259921312332, "step": 45, "step_time": 78.66523612500168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.37021562457084656, "epoch": 0.092, "frac_reward_zero_std": 0.0, "grad_norm": 1.3564432859420776, "kl": 0.002915932796895504, "learning_rate": 4.997050398198977e-06, "loss": 0.0626, "num_tokens": 258896.0, "reward": 0.33000001311302185, "reward_std": 0.5612866878509521, "rewards/reward_func/mean": 0.33000001311302185, "rewards/reward_func/std": 0.5336666107177734, "sampling/importance_sampling_ratio/max": 1.5622109174728394, "sampling/importance_sampling_ratio/mean": 1.0067038536071777, "sampling/importance_sampling_ratio/min": 0.41681596636772156, "sampling/sampling_logp_difference/max": 0.5778782367706299, "sampling/sampling_logp_difference/mean": 0.024849699810147285, "step": 46, "step_time": 91.13322191301268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.125, "completions/mean_terminated_length": 48.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3771839439868927, "epoch": 0.094, "frac_reward_zero_std": 0.0, "grad_norm": 1.5632238388061523, "kl": 0.0020723105408251286, "learning_rate": 4.9966440995606415e-06, "loss": -0.192, "num_tokens": 264298.0, "reward": 0.20499999821186066, "reward_std": 0.32124459743499756, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.4859159588813782, "sampling/importance_sampling_ratio/max": 2.447948694229126, "sampling/importance_sampling_ratio/mean": 1.2227914333343506, "sampling/importance_sampling_ratio/min": 0.46755385398864746, "sampling/sampling_logp_difference/max": 0.3898049592971802, "sampling/sampling_logp_difference/mean": 0.023719076067209244, "step": 47, "step_time": 77.2298025219934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 41.375, "completions/mean_terminated_length": 41.375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3675180673599243, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 1.202669620513916, "kl": 0.003329810919240117, "learning_rate": 4.9962116123070925e-06, "loss": 0.1727, "num_tokens": 269970.0, "reward": 0.3412500023841858, "reward_std": 0.5402647256851196, "rewards/reward_func/mean": 0.3412500023841858, "rewards/reward_func/std": 0.5210549235343933, "sampling/importance_sampling_ratio/max": 2.6151790618896484, "sampling/importance_sampling_ratio/mean": 0.9013060331344604, "sampling/importance_sampling_ratio/min": 0.20561860501766205, "sampling/sampling_logp_difference/max": 0.5823161602020264, "sampling/sampling_logp_difference/mean": 0.03182598948478699, "step": 48, "step_time": 103.054377449007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3362388014793396, "epoch": 0.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.9811183214187622, "kl": 0.001854179659858346, "learning_rate": 4.9957529409749185e-06, "loss": -0.0427, "num_tokens": 275532.0, "reward": 0.2212499976158142, "reward_std": 0.5194467306137085, "rewards/reward_func/mean": 0.2212499976158142, "rewards/reward_func/std": 0.48111292719841003, "sampling/importance_sampling_ratio/max": 1.1795125007629395, "sampling/importance_sampling_ratio/mean": 0.8117722868919373, "sampling/importance_sampling_ratio/min": 0.32398006319999695, "sampling/sampling_logp_difference/max": 0.3128964900970459, "sampling/sampling_logp_difference/mean": 0.020228173583745956, "step": 49, "step_time": 92.03725726599805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.33680054545402527, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.8777604699134827, "kl": 0.0024415755178779364, "learning_rate": 4.995268090375362e-06, "loss": 0.05, "num_tokens": 281466.0, "reward": 0.0637500062584877, "reward_std": 0.27558743953704834, "rewards/reward_func/mean": 0.0637500062584877, "rewards/reward_func/std": 0.3678484857082367, "sampling/importance_sampling_ratio/max": 1.50518000125885, "sampling/importance_sampling_ratio/mean": 0.9082848429679871, "sampling/importance_sampling_ratio/min": 0.41154056787490845, "sampling/sampling_logp_difference/max": 0.5747478008270264, "sampling/sampling_logp_difference/mean": 0.024731453508138657, "step": 50, "step_time": 101.29344615599257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.41207462549209595, "epoch": 0.102, "frac_reward_zero_std": 0.0, "grad_norm": 1.1961466073989868, "kl": 0.0038733931723982096, "learning_rate": 4.99475706559428e-06, "loss": 0.0661, "num_tokens": 286937.0, "reward": 0.45500001311302185, "reward_std": 0.5954470634460449, "rewards/reward_func/mean": 0.45500001311302185, "rewards/reward_func/std": 0.5518540143966675, "sampling/importance_sampling_ratio/max": 1.4747618436813354, "sampling/importance_sampling_ratio/mean": 0.934749960899353, "sampling/importance_sampling_ratio/min": 0.4552203118801117, "sampling/sampling_logp_difference/max": 0.3542771339416504, "sampling/sampling_logp_difference/mean": 0.023817723616957664, "step": 51, "step_time": 98.44445793100749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.31511616706848145, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 1.32075035572052, "kl": 0.0016216668300330639, "learning_rate": 4.994219871992077e-06, "loss": -0.0975, "num_tokens": 292284.0, "reward": 0.6025000214576721, "reward_std": 0.2593764066696167, "rewards/reward_func/mean": 0.6025000214576721, "rewards/reward_func/std": 0.5323197245597839, "sampling/importance_sampling_ratio/max": 2.0314249992370605, "sampling/importance_sampling_ratio/mean": 1.274023413658142, "sampling/importance_sampling_ratio/min": 0.5603557229042053, "sampling/sampling_logp_difference/max": 0.7182197570800781, "sampling/sampling_logp_difference/mean": 0.021842751652002335, "step": 52, "step_time": 74.32618405998801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 57.875, "completions/mean_terminated_length": 57.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.37222176790237427, "epoch": 0.106, "frac_reward_zero_std": 0.0, "grad_norm": 0.8657234311103821, "kl": 0.004503990523517132, "learning_rate": 4.993656515203662e-06, "loss": -0.0014, "num_tokens": 298374.0, "reward": 0.17249998450279236, "reward_std": 0.3328244686126709, "rewards/reward_func/mean": 0.17249998450279236, "rewards/reward_func/std": 0.4848784804344177, "sampling/importance_sampling_ratio/max": 1.6778643131256104, "sampling/importance_sampling_ratio/mean": 0.9224530458450317, "sampling/importance_sampling_ratio/min": 0.3226885199546814, "sampling/sampling_logp_difference/max": 1.1129628419876099, "sampling/sampling_logp_difference/mean": 0.023930778726935387, "step": 53, "step_time": 105.75805833100458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.38617652654647827, "epoch": 0.108, "frac_reward_zero_std": 0.0, "grad_norm": 0.9701952934265137, "kl": 0.002225311938673258, "learning_rate": 4.99306700113838e-06, "loss": -0.0465, "num_tokens": 303786.0, "reward": 0.33000001311302185, "reward_std": 0.5561413764953613, "rewards/reward_func/mean": 0.33000001311302185, "rewards/reward_func/std": 0.5339342355728149, "sampling/importance_sampling_ratio/max": 1.0194271802902222, "sampling/importance_sampling_ratio/mean": 0.7991744875907898, "sampling/importance_sampling_ratio/min": 0.33269256353378296, "sampling/sampling_logp_difference/max": 0.33765721321105957, "sampling/sampling_logp_difference/mean": 0.026410941034555435, "step": 54, "step_time": 83.15403410801082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3604234457015991, "epoch": 0.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.8578059077262878, "kl": 0.0011407495476305485, "learning_rate": 4.9924513359799555e-06, "loss": 0.0794, "num_tokens": 309275.0, "reward": 0.33250001072883606, "reward_std": 0.5635805130004883, "rewards/reward_func/mean": 0.33250001072883606, "rewards/reward_func/std": 0.5450229048728943, "sampling/importance_sampling_ratio/max": 1.3248019218444824, "sampling/importance_sampling_ratio/mean": 0.8565744161605835, "sampling/importance_sampling_ratio/min": 0.42319124937057495, "sampling/sampling_logp_difference/max": 0.5236988067626953, "sampling/sampling_logp_difference/mean": 0.022501163184642792, "step": 55, "step_time": 73.6418833520147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3908393681049347, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 2.7073943614959717, "kl": 0.003342224285006523, "learning_rate": 4.991809526186424e-06, "loss": -0.2814, "num_tokens": 314307.0, "reward": 0.32375001907348633, "reward_std": 0.5598282814025879, "rewards/reward_func/mean": 0.32375001907348633, "rewards/reward_func/std": 0.5455518364906311, "sampling/importance_sampling_ratio/max": 2.344586133956909, "sampling/importance_sampling_ratio/mean": 1.196304440498352, "sampling/importance_sampling_ratio/min": 0.5245997309684753, "sampling/sampling_logp_difference/max": 1.1410305500030518, "sampling/sampling_logp_difference/mean": 0.02790486253798008, "step": 56, "step_time": 90.67011975299101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.34012845158576965, "epoch": 0.114, "frac_reward_zero_std": 0.0, "grad_norm": 0.8670101761817932, "kl": 0.0026079914532601833, "learning_rate": 4.991141578490066e-06, "loss": -0.1396, "num_tokens": 320264.0, "reward": 0.07624999433755875, "reward_std": 0.29174044728279114, "rewards/reward_func/mean": 0.07624999433755875, "rewards/reward_func/std": 0.37625741958618164, "sampling/importance_sampling_ratio/max": 1.4562015533447266, "sampling/importance_sampling_ratio/mean": 0.7509514689445496, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.34556615352630615, "sampling/sampling_logp_difference/mean": 0.02316705882549286, "step": 57, "step_time": 96.33522103502764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 53.125, "completions/mean_terminated_length": 53.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3367074131965637, "epoch": 0.116, "frac_reward_zero_std": 0.0, "grad_norm": 1.0496222972869873, "kl": 0.0054735890589654446, "learning_rate": 4.990447499897339e-06, "loss": 0.1168, "num_tokens": 325728.0, "reward": 0.17374999821186066, "reward_std": 0.538986086845398, "rewards/reward_func/mean": 0.17374999821186066, "rewards/reward_func/std": 0.49951228499412537, "sampling/importance_sampling_ratio/max": 1.447581171989441, "sampling/importance_sampling_ratio/mean": 0.9637683629989624, "sampling/importance_sampling_ratio/min": 0.6208034157752991, "sampling/sampling_logp_difference/max": 0.4196118116378784, "sampling/sampling_logp_difference/mean": 0.023549657315015793, "step": 58, "step_time": 97.37826122099068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.875, "completions/mean_terminated_length": 49.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.36870628595352173, "epoch": 0.118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3209524154663086, "kl": 0.002333354204893112, "learning_rate": 4.989727297688797e-06, "loss": 0.1813, "num_tokens": 331317.0, "reward": 0.3449999988079071, "reward_std": 0.5488909482955933, "rewards/reward_func/mean": 0.3449999988079071, "rewards/reward_func/std": 0.5325947403907776, "sampling/importance_sampling_ratio/max": 1.9513617753982544, "sampling/importance_sampling_ratio/mean": 0.996979296207428, "sampling/importance_sampling_ratio/min": 0.6756687760353088, "sampling/sampling_logp_difference/max": 0.4977457523345947, "sampling/sampling_logp_difference/mean": 0.02654324471950531, "step": 59, "step_time": 92.74966769100865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3398672938346863, "epoch": 0.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.9856535792350769, "kl": 0.0027454416267573833, "learning_rate": 4.98898097941902e-06, "loss": 0.0051, "num_tokens": 336444.0, "reward": -0.06875000149011612, "reward_std": 0.06034637242555618, "rewards/reward_func/mean": -0.06875000149011612, "rewards/reward_func/std": 0.05667892098426819, "sampling/importance_sampling_ratio/max": 1.05806303024292, "sampling/importance_sampling_ratio/mean": 0.7077381014823914, "sampling/importance_sampling_ratio/min": 0.29610589146614075, "sampling/sampling_logp_difference/max": 0.538194477558136, "sampling/sampling_logp_difference/mean": 0.026932962238788605, "step": 60, "step_time": 85.23989409799105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3458422124385834, "epoch": 0.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.7404291033744812, "kl": 0.0024542706087231636, "learning_rate": 4.988208552916535e-06, "loss": 0.0047, "num_tokens": 341913.0, "reward": 0.06750001013278961, "reward_std": 0.2946765422821045, "rewards/reward_func/mean": 0.06750001013278961, "rewards/reward_func/std": 0.38231438398361206, "sampling/importance_sampling_ratio/max": 1.2514207363128662, "sampling/importance_sampling_ratio/mean": 0.8571313619613647, "sampling/importance_sampling_ratio/min": 0.37189623713493347, "sampling/sampling_logp_difference/max": 0.32369494438171387, "sampling/sampling_logp_difference/mean": 0.01881476677954197, "step": 61, "step_time": 81.40806046701618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3777502477169037, "epoch": 0.124, "frac_reward_zero_std": 0.0, "grad_norm": 1.5111892223358154, "kl": 0.0038244668394327164, "learning_rate": 4.98741002628373e-06, "loss": -0.0188, "num_tokens": 347236.0, "reward": 0.6025000214576721, "reward_std": 0.5539374351501465, "rewards/reward_func/mean": 0.6025000214576721, "rewards/reward_func/std": 0.5297641158103943, "sampling/importance_sampling_ratio/max": 2.0949490070343018, "sampling/importance_sampling_ratio/mean": 1.2000421285629272, "sampling/importance_sampling_ratio/min": 0.6139946579933167, "sampling/sampling_logp_difference/max": 0.6703405380249023, "sampling/sampling_logp_difference/mean": 0.02322327345609665, "step": 62, "step_time": 69.72723935198155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3558458089828491, "epoch": 0.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.667698085308075, "kl": 0.0018400326371192932, "learning_rate": 4.9865854078967715e-06, "loss": 0.0829, "num_tokens": 352685.0, "reward": 0.5875000357627869, "reward_std": 0.5716196298599243, "rewards/reward_func/mean": 0.5875000357627869, "rewards/reward_func/std": 0.5452063679695129, "sampling/importance_sampling_ratio/max": 0.9193384647369385, "sampling/importance_sampling_ratio/mean": 0.632682740688324, "sampling/importance_sampling_ratio/min": 0.3342023193836212, "sampling/sampling_logp_difference/max": 0.8547244071960449, "sampling/sampling_logp_difference/mean": 0.022451236844062805, "step": 63, "step_time": 64.33397425999283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 46.625, "completions/mean_terminated_length": 46.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.36075106263160706, "epoch": 0.128, "frac_reward_zero_std": 0.0, "grad_norm": 1.6670936346054077, "kl": 0.0064406488090753555, "learning_rate": 4.985734706405516e-06, "loss": -0.1783, "num_tokens": 358643.0, "reward": 0.08750000596046448, "reward_std": 0.2707710862159729, "rewards/reward_func/mean": 0.08750000596046448, "rewards/reward_func/std": 0.3586781620979309, "sampling/importance_sampling_ratio/max": 2.078122615814209, "sampling/importance_sampling_ratio/mean": 1.245069980621338, "sampling/importance_sampling_ratio/min": 0.5846289992332458, "sampling/sampling_logp_difference/max": 0.3692970275878906, "sampling/sampling_logp_difference/mean": 0.022641174495220184, "step": 64, "step_time": 90.54826116497861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.34251487255096436, "epoch": 0.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.6975246071815491, "kl": 0.0017276068683713675, "learning_rate": 4.9848579307334195e-06, "loss": 0.0456, "num_tokens": 365099.0, "reward": 0.2199999988079071, "reward_std": 0.4802777171134949, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.44468289613723755, "sampling/importance_sampling_ratio/max": 1.202215313911438, "sampling/importance_sampling_ratio/mean": 0.8249953985214233, "sampling/importance_sampling_ratio/min": 0.5363028645515442, "sampling/sampling_logp_difference/max": 0.45500755310058594, "sampling/sampling_logp_difference/mean": 0.01933646947145462, "step": 65, "step_time": 85.8507220740139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.35193923115730286, "epoch": 0.132, "frac_reward_zero_std": 0.0, "grad_norm": 1.0195058584213257, "kl": 0.0025058817118406296, "learning_rate": 4.983955090077445e-06, "loss": -0.0506, "num_tokens": 369968.0, "reward": 0.20749999582767487, "reward_std": 0.5109157562255859, "rewards/reward_func/mean": 0.20749999582767487, "rewards/reward_func/std": 0.4736107587814331, "sampling/importance_sampling_ratio/max": 1.2251256704330444, "sampling/importance_sampling_ratio/mean": 0.9472914934158325, "sampling/importance_sampling_ratio/min": 0.8227061629295349, "sampling/sampling_logp_difference/max": 0.31458473205566406, "sampling/sampling_logp_difference/mean": 0.017272518947720528, "step": 66, "step_time": 76.18023397601792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.36691945791244507, "epoch": 0.134, "frac_reward_zero_std": 0.0, "grad_norm": 1.0082647800445557, "kl": 0.0018996293656527996, "learning_rate": 4.983026193907962e-06, "loss": 0.1843, "num_tokens": 375164.0, "reward": 0.21249999105930328, "reward_std": 0.319685161113739, "rewards/reward_func/mean": 0.21249999105930328, "rewards/reward_func/std": 0.4872591197490692, "sampling/importance_sampling_ratio/max": 1.7669250965118408, "sampling/importance_sampling_ratio/mean": 0.8863449692726135, "sampling/importance_sampling_ratio/min": 0.26863494515419006, "sampling/sampling_logp_difference/max": 0.3789827823638916, "sampling/sampling_logp_difference/mean": 0.02467949688434601, "step": 67, "step_time": 93.1446157169994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.35669732093811035, "epoch": 0.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.8750259280204773, "kl": 0.0033818050287663937, "learning_rate": 4.982071251968653e-06, "loss": 0.051, "num_tokens": 380504.0, "reward": 0.32875001430511475, "reward_std": 0.5622336864471436, "rewards/reward_func/mean": 0.32875001430511475, "rewards/reward_func/std": 0.5360020399093628, "sampling/importance_sampling_ratio/max": 1.2201801538467407, "sampling/importance_sampling_ratio/mean": 0.8601148724555969, "sampling/importance_sampling_ratio/min": 0.6346798539161682, "sampling/sampling_logp_difference/max": 0.3918271064758301, "sampling/sampling_logp_difference/mean": 0.023806363344192505, "step": 68, "step_time": 72.7238112029736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.4199118912220001, "epoch": 0.138, "frac_reward_zero_std": 0.0, "grad_norm": 1.0077135562896729, "kl": 0.00302119180560112, "learning_rate": 4.981090274276406e-06, "loss": 0.1156, "num_tokens": 386315.0, "reward": 0.059999994933605194, "reward_std": 0.2796437740325928, "rewards/reward_func/mean": 0.059999994933605194, "rewards/reward_func/std": 0.36245197057724, "sampling/importance_sampling_ratio/max": 1.8291817903518677, "sampling/importance_sampling_ratio/mean": 0.9294091463088989, "sampling/importance_sampling_ratio/min": 0.3144456446170807, "sampling/sampling_logp_difference/max": 0.610379695892334, "sampling/sampling_logp_difference/mean": 0.031378112733364105, "step": 69, "step_time": 88.02450225598295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.38176238536834717, "epoch": 0.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.892582356929779, "kl": 0.0031344564631581306, "learning_rate": 4.980083271121215e-06, "loss": -0.1972, "num_tokens": 391929.0, "reward": 0.2150000035762787, "reward_std": 0.50013267993927, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.46398892998695374, "sampling/importance_sampling_ratio/max": 1.8030683994293213, "sampling/importance_sampling_ratio/mean": 0.9491258859634399, "sampling/importance_sampling_ratio/min": 0.3493870496749878, "sampling/sampling_logp_difference/max": 0.4494798183441162, "sampling/sampling_logp_difference/mean": 0.02591659128665924, "step": 70, "step_time": 71.29640350298723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.4286239743232727, "epoch": 0.142, "frac_reward_zero_std": 0.0, "grad_norm": 1.2103837728500366, "kl": 0.0030327460262924433, "learning_rate": 4.979050253066064e-06, "loss": -0.1033, "num_tokens": 397951.0, "reward": 0.1875, "reward_std": 0.344761461019516, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.48029011487960815, "sampling/importance_sampling_ratio/max": 1.681307315826416, "sampling/importance_sampling_ratio/mean": 0.8077924847602844, "sampling/importance_sampling_ratio/min": 0.35340648889541626, "sampling/sampling_logp_difference/max": 0.5255258083343506, "sampling/sampling_logp_difference/mean": 0.02561108022928238, "step": 71, "step_time": 94.12945175101049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3729614019393921, "epoch": 0.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.875525951385498, "kl": 0.001732141012325883, "learning_rate": 4.977991230946824e-06, "loss": -0.0218, "num_tokens": 402966.0, "reward": 0.34375, "reward_std": 0.5594232082366943, "rewards/reward_func/mean": 0.34375, "rewards/reward_func/std": 0.5332096219062805, "sampling/importance_sampling_ratio/max": 1.3324414491653442, "sampling/importance_sampling_ratio/mean": 0.9489821195602417, "sampling/importance_sampling_ratio/min": 0.6200194358825684, "sampling/sampling_logp_difference/max": 0.282620906829834, "sampling/sampling_logp_difference/mean": 0.021304737776517868, "step": 72, "step_time": 68.7971295939933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.32240110635757446, "epoch": 0.146, "frac_reward_zero_std": 0.0, "grad_norm": 0.950322151184082, "kl": 0.0028352453373372555, "learning_rate": 4.976906215872137e-06, "loss": 0.0467, "num_tokens": 409055.0, "reward": 0.3387500047683716, "reward_std": 0.2871127426624298, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.5407254099845886, "sampling/importance_sampling_ratio/max": 1.2006139755249023, "sampling/importance_sampling_ratio/mean": 0.8248869180679321, "sampling/importance_sampling_ratio/min": 0.4684114456176758, "sampling/sampling_logp_difference/max": 0.43263185024261475, "sampling/sampling_logp_difference/mean": 0.023945681750774384, "step": 73, "step_time": 102.22122251600376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3135406970977783, "epoch": 0.148, "frac_reward_zero_std": 0.0, "grad_norm": 1.154463529586792, "kl": 0.03127220273017883, "learning_rate": 4.975795219223299e-06, "loss": -0.0935, "num_tokens": 414402.0, "reward": 0.3412500023841858, "reward_std": 0.5579792261123657, "rewards/reward_func/mean": 0.3412500023841858, "rewards/reward_func/std": 0.535975456237793, "sampling/importance_sampling_ratio/max": 2.067894220352173, "sampling/importance_sampling_ratio/mean": 0.9438801407814026, "sampling/importance_sampling_ratio/min": 0.4140065908432007, "sampling/sampling_logp_difference/max": 0.4713999032974243, "sampling/sampling_logp_difference/mean": 0.02523641288280487, "step": 74, "step_time": 89.00602476199856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.32595470547676086, "epoch": 0.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.8300859332084656, "kl": 0.007171849254518747, "learning_rate": 4.974658252654135e-06, "loss": -0.0902, "num_tokens": 419796.0, "reward": 0.48250001668930054, "reward_std": 0.5001860857009888, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5347295999526978, "sampling/importance_sampling_ratio/max": 1.4946962594985962, "sampling/importance_sampling_ratio/mean": 0.8154863119125366, "sampling/importance_sampling_ratio/min": 0.3788175582885742, "sampling/sampling_logp_difference/max": 0.7174708843231201, "sampling/sampling_logp_difference/mean": 0.021303167566657066, "step": 75, "step_time": 59.640716498019174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 49.375, "completions/mean_terminated_length": 49.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.41575515270233154, "epoch": 0.152, "frac_reward_zero_std": 0.0, "grad_norm": 1.4510153532028198, "kl": 0.011087974533438683, "learning_rate": 4.973495328090891e-06, "loss": 0.1287, "num_tokens": 424726.0, "reward": 0.3387500047683716, "reward_std": 0.5444081425666809, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.5208355784416199, "sampling/importance_sampling_ratio/max": 1.5896728038787842, "sampling/importance_sampling_ratio/mean": 1.0859061479568481, "sampling/importance_sampling_ratio/min": 0.718471348285675, "sampling/sampling_logp_difference/max": 0.6861748695373535, "sampling/sampling_logp_difference/mean": 0.02493377774953842, "step": 76, "step_time": 70.72388471697923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 42.875, "completions/mean_terminated_length": 42.875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3451007008552551, "epoch": 0.154, "frac_reward_zero_std": 0.0, "grad_norm": 1.7033120393753052, "kl": 0.006896092556416988, "learning_rate": 4.972306457732091e-06, "loss": 0.1233, "num_tokens": 429957.0, "reward": 0.09125000238418579, "reward_std": 0.2647804319858551, "rewards/reward_func/mean": 0.09125000238418579, "rewards/reward_func/std": 0.36841118335723877, "sampling/importance_sampling_ratio/max": 1.4747828245162964, "sampling/importance_sampling_ratio/mean": 1.0489879846572876, "sampling/importance_sampling_ratio/min": 0.6281050443649292, "sampling/sampling_logp_difference/max": 0.8355374336242676, "sampling/sampling_logp_difference/mean": 0.02908758632838726, "step": 77, "step_time": 81.86546373798046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3363872170448303, "epoch": 0.156, "frac_reward_zero_std": 0.0, "grad_norm": 1.0083401203155518, "kl": 0.003795074066147208, "learning_rate": 4.971091654048427e-06, "loss": 0.3044, "num_tokens": 436347.0, "reward": 0.07499999552965164, "reward_std": 0.2905214726924896, "rewards/reward_func/mean": 0.07499999552965164, "rewards/reward_func/std": 0.378644198179245, "sampling/importance_sampling_ratio/max": 2.2779664993286133, "sampling/importance_sampling_ratio/mean": 1.063035249710083, "sampling/importance_sampling_ratio/min": 0.37523871660232544, "sampling/sampling_logp_difference/max": 0.3972114324569702, "sampling/sampling_logp_difference/mean": 0.02419961616396904, "step": 78, "step_time": 88.4848685679899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3581341505050659, "epoch": 0.158, "frac_reward_zero_std": 0.0, "grad_norm": 1.572177767753601, "kl": 0.0030747244600206614, "learning_rate": 4.96985092978261e-06, "loss": -0.0336, "num_tokens": 441325.0, "reward": 0.051249999552965164, "reward_std": 0.3115207850933075, "rewards/reward_func/mean": 0.051249999552965164, "rewards/reward_func/std": 0.3852434754371643, "sampling/importance_sampling_ratio/max": 2.044938087463379, "sampling/importance_sampling_ratio/mean": 0.971229076385498, "sampling/importance_sampling_ratio/min": 0.1840338557958603, "sampling/sampling_logp_difference/max": 0.6103886365890503, "sampling/sampling_logp_difference/mean": 0.02984805777668953, "step": 79, "step_time": 92.6209842649987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.32628703117370605, "epoch": 0.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.8613694906234741, "kl": 0.0033719956409186125, "learning_rate": 4.968584297949255e-06, "loss": -0.0019, "num_tokens": 446935.0, "reward": 0.4662500023841858, "reward_std": 0.5965955257415771, "rewards/reward_func/mean": 0.4662500023841858, "rewards/reward_func/std": 0.5527060627937317, "sampling/importance_sampling_ratio/max": 2.1245856285095215, "sampling/importance_sampling_ratio/mean": 0.8101105690002441, "sampling/importance_sampling_ratio/min": 0.354059100151062, "sampling/sampling_logp_difference/max": 0.4604175090789795, "sampling/sampling_logp_difference/mean": 0.024325117468833923, "step": 80, "step_time": 91.05656213400653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 58.125, "completions/mean_terminated_length": 58.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3670700788497925, "epoch": 0.162, "frac_reward_zero_std": 0.0, "grad_norm": 1.2181848287582397, "kl": 0.002382858656346798, "learning_rate": 4.967291771834727e-06, "loss": -0.2348, "num_tokens": 452473.0, "reward": 0.15125001966953278, "reward_std": 0.3374948501586914, "rewards/reward_func/mean": 0.15125001966953278, "rewards/reward_func/std": 0.4997267425060272, "sampling/importance_sampling_ratio/max": 2.508380174636841, "sampling/importance_sampling_ratio/mean": 1.2941944599151611, "sampling/importance_sampling_ratio/min": 0.646767258644104, "sampling/sampling_logp_difference/max": 0.3313436508178711, "sampling/sampling_logp_difference/mean": 0.02512197196483612, "step": 81, "step_time": 90.79619163498865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.2903425693511963, "epoch": 0.164, "frac_reward_zero_std": 0.0, "grad_norm": 1.2693836688995361, "kl": 0.0025903629139065742, "learning_rate": 4.965973364997015e-06, "loss": -0.0367, "num_tokens": 458523.0, "reward": 0.17125000059604645, "reward_std": 0.3282886743545532, "rewards/reward_func/mean": 0.17125000059604645, "rewards/reward_func/std": 0.499898225069046, "sampling/importance_sampling_ratio/max": 1.9814573526382446, "sampling/importance_sampling_ratio/mean": 0.9206903576850891, "sampling/importance_sampling_ratio/min": 0.2931478023529053, "sampling/sampling_logp_difference/max": 0.4033019542694092, "sampling/sampling_logp_difference/mean": 0.02239578776061535, "step": 82, "step_time": 101.29266464500688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3330921530723572, "epoch": 0.166, "frac_reward_zero_std": 0.0, "grad_norm": 1.2236757278442383, "kl": 0.003864692524075508, "learning_rate": 4.964629091265583e-06, "loss": -0.0728, "num_tokens": 463684.0, "reward": 0.4675000011920929, "reward_std": 0.5979688763618469, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.5541208982467651, "sampling/importance_sampling_ratio/max": 1.6764252185821533, "sampling/importance_sampling_ratio/mean": 1.0374202728271484, "sampling/importance_sampling_ratio/min": 0.6156142950057983, "sampling/sampling_logp_difference/max": 0.5306464433670044, "sampling/sampling_logp_difference/mean": 0.02434811368584633, "step": 83, "step_time": 65.75530088201049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 50.875, "completions/mean_terminated_length": 50.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3707743287086487, "epoch": 0.168, "frac_reward_zero_std": 0.0, "grad_norm": 1.0121077299118042, "kl": 0.006942209787666798, "learning_rate": 4.963258964741227e-06, "loss": 0.1128, "num_tokens": 468918.0, "reward": 0.3462499976158142, "reward_std": 0.5688021183013916, "rewards/reward_func/mean": 0.3462499976158142, "rewards/reward_func/std": 0.5434529185295105, "sampling/importance_sampling_ratio/max": 1.8691112995147705, "sampling/importance_sampling_ratio/mean": 0.9797255992889404, "sampling/importance_sampling_ratio/min": 0.19412098824977875, "sampling/sampling_logp_difference/max": 0.6592090129852295, "sampling/sampling_logp_difference/mean": 0.027863148599863052, "step": 84, "step_time": 69.84349327700329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.350742369890213, "epoch": 0.17, "frac_reward_zero_std": 0.0, "grad_norm": 1.236136794090271, "kl": 0.0026309723034501076, "learning_rate": 4.961862999795923e-06, "loss": 0.0105, "num_tokens": 474878.0, "reward": 0.061250001192092896, "reward_std": 0.2900194525718689, "rewards/reward_func/mean": 0.061250001192092896, "rewards/reward_func/std": 0.3823774456977844, "sampling/importance_sampling_ratio/max": 2.504836320877075, "sampling/importance_sampling_ratio/mean": 1.2779099941253662, "sampling/importance_sampling_ratio/min": 0.65166175365448, "sampling/sampling_logp_difference/max": 0.5060451030731201, "sampling/sampling_logp_difference/mean": 0.021245911717414856, "step": 85, "step_time": 77.75256623400492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.36720341444015503, "epoch": 0.172, "frac_reward_zero_std": 0.0, "grad_norm": 1.3172496557235718, "kl": 0.0030011916533112526, "learning_rate": 4.960441211072686e-06, "loss": -0.1479, "num_tokens": 480065.0, "reward": 0.4399999976158142, "reward_std": 0.5658103227615356, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.5238865613937378, "sampling/importance_sampling_ratio/max": 2.6345438957214355, "sampling/importance_sampling_ratio/mean": 1.2061142921447754, "sampling/importance_sampling_ratio/min": 0.6962835192680359, "sampling/sampling_logp_difference/max": 0.3562922477722168, "sampling/sampling_logp_difference/mean": 0.023759279400110245, "step": 86, "step_time": 79.76976580297924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3295614421367645, "epoch": 0.174, "frac_reward_zero_std": 0.0, "grad_norm": 1.2377467155456543, "kl": 0.0017705978825688362, "learning_rate": 4.958993613485406e-06, "loss": 0.1347, "num_tokens": 485174.0, "reward": -0.05000000074505806, "reward_std": 0.03639974445104599, "rewards/reward_func/mean": -0.05000000074505806, "rewards/reward_func/std": 0.041403934359550476, "sampling/importance_sampling_ratio/max": 2.701768398284912, "sampling/importance_sampling_ratio/mean": 1.1716480255126953, "sampling/importance_sampling_ratio/min": 0.710328221321106, "sampling/sampling_logp_difference/max": 0.3339419364929199, "sampling/sampling_logp_difference/mean": 0.02282622456550598, "step": 87, "step_time": 98.58460397701128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3728080689907074, "epoch": 0.176, "frac_reward_zero_std": 0.0, "grad_norm": 0.9409402012825012, "kl": 0.004020972643047571, "learning_rate": 4.957520222218695e-06, "loss": -0.2078, "num_tokens": 491186.0, "reward": 0.20624999701976776, "reward_std": 0.3007173538208008, "rewards/reward_func/mean": 0.20624999701976776, "rewards/reward_func/std": 0.4542478024959564, "sampling/importance_sampling_ratio/max": 1.1913342475891113, "sampling/importance_sampling_ratio/mean": 0.8587566018104553, "sampling/importance_sampling_ratio/min": 0.5435622930526733, "sampling/sampling_logp_difference/max": 0.33767926692962646, "sampling/sampling_logp_difference/mean": 0.02355227991938591, "step": 88, "step_time": 83.78908705202048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.41878965497016907, "epoch": 0.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.9201162457466125, "kl": 0.0033999462611973286, "learning_rate": 4.956021052727731e-06, "loss": -0.0817, "num_tokens": 497013.0, "reward": 0.09125000238418579, "reward_std": 0.2839151620864868, "rewards/reward_func/mean": 0.09125000238418579, "rewards/reward_func/std": 0.369727224111557, "sampling/importance_sampling_ratio/max": 1.4009984731674194, "sampling/importance_sampling_ratio/mean": 0.8987118005752563, "sampling/importance_sampling_ratio/min": 0.636340320110321, "sampling/sampling_logp_difference/max": 0.3343019485473633, "sampling/sampling_logp_difference/mean": 0.02563662827014923, "step": 89, "step_time": 76.07593618502142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.375, "completions/mean_terminated_length": 50.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3703194260597229, "epoch": 0.18, "frac_reward_zero_std": 0.0, "grad_norm": 1.0612517595291138, "kl": 0.00556425005197525, "learning_rate": 4.954496120738094e-06, "loss": 0.1675, "num_tokens": 502431.0, "reward": 0.3462499976158142, "reward_std": 0.5637357234954834, "rewards/reward_func/mean": 0.3462499976158142, "rewards/reward_func/std": 0.5389921069145203, "sampling/importance_sampling_ratio/max": 1.7256437540054321, "sampling/importance_sampling_ratio/mean": 0.9468981027603149, "sampling/importance_sampling_ratio/min": 0.606105387210846, "sampling/sampling_logp_difference/max": 0.343442440032959, "sampling/sampling_logp_difference/mean": 0.022628474980592728, "step": 90, "step_time": 75.86889905299176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3772880434989929, "epoch": 0.182, "frac_reward_zero_std": 0.0, "grad_norm": 1.3590164184570312, "kl": 0.0036642742343246937, "learning_rate": 4.952945442245598e-06, "loss": -0.2427, "num_tokens": 508352.0, "reward": 0.07499998807907104, "reward_std": 0.28367576003074646, "rewards/reward_func/mean": 0.07499998807907104, "rewards/reward_func/std": 0.3612280488014221, "sampling/importance_sampling_ratio/max": 1.6069527864456177, "sampling/importance_sampling_ratio/mean": 0.8528153300285339, "sampling/importance_sampling_ratio/min": 0.2771243453025818, "sampling/sampling_logp_difference/max": 0.6460120677947998, "sampling/sampling_logp_difference/mean": 0.026305314153432846, "step": 91, "step_time": 101.26391361499554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3880101442337036, "epoch": 0.184, "frac_reward_zero_std": 0.0, "grad_norm": 2.07570219039917, "kl": 0.0035898014903068542, "learning_rate": 4.951369033516127e-06, "loss": -0.0628, "num_tokens": 513922.0, "reward": 0.3474999964237213, "reward_std": 0.538013756275177, "rewards/reward_func/mean": 0.3474999964237213, "rewards/reward_func/std": 0.5158834457397461, "sampling/importance_sampling_ratio/max": 2.46356201171875, "sampling/importance_sampling_ratio/mean": 1.4318658113479614, "sampling/importance_sampling_ratio/min": 0.7919402122497559, "sampling/sampling_logp_difference/max": 0.5595130920410156, "sampling/sampling_logp_difference/mean": 0.021545136347413063, "step": 92, "step_time": 89.50697983897408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3597927391529083, "epoch": 0.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.7946093082427979, "kl": 0.00549793615937233, "learning_rate": 4.949766911085461e-06, "loss": -0.0008, "num_tokens": 519677.0, "reward": 0.2150000035762787, "reward_std": 0.5236697196960449, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.48529812693595886, "sampling/importance_sampling_ratio/max": 1.5985785722732544, "sampling/importance_sampling_ratio/mean": 0.9824653267860413, "sampling/importance_sampling_ratio/min": 0.5499786734580994, "sampling/sampling_logp_difference/max": 0.40012407302856445, "sampling/sampling_logp_difference/mean": 0.020420320332050323, "step": 93, "step_time": 75.89820895600133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.37839484214782715, "epoch": 0.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.6853567957878113, "kl": 0.003214706666767597, "learning_rate": 4.948139091759108e-06, "loss": 0.2257, "num_tokens": 525806.0, "reward": 0.19624999165534973, "reward_std": 0.5386360883712769, "rewards/reward_func/mean": 0.19624999165534973, "rewards/reward_func/std": 0.4986822009086609, "sampling/importance_sampling_ratio/max": 2.103787422180176, "sampling/importance_sampling_ratio/mean": 1.0454094409942627, "sampling/importance_sampling_ratio/min": 0.4870206117630005, "sampling/sampling_logp_difference/max": 0.3358621597290039, "sampling/sampling_logp_difference/mean": 0.01956326514482498, "step": 94, "step_time": 85.72357106002164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3306207060813904, "epoch": 0.19, "frac_reward_zero_std": 0.0, "grad_norm": 1.1758936643600464, "kl": 0.002643125131726265, "learning_rate": 4.946485592612122e-06, "loss": -0.1909, "num_tokens": 531520.0, "reward": 0.06624999642372131, "reward_std": 0.3067837357521057, "rewards/reward_func/mean": 0.06624999642372131, "rewards/reward_func/std": 0.38149845600128174, "sampling/importance_sampling_ratio/max": 1.9292004108428955, "sampling/importance_sampling_ratio/mean": 1.0738458633422852, "sampling/importance_sampling_ratio/min": 0.6487919688224792, "sampling/sampling_logp_difference/max": 0.3406977653503418, "sampling/sampling_logp_difference/mean": 0.0220349058508873, "step": 95, "step_time": 77.85138512399863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.312557190656662, "epoch": 0.192, "frac_reward_zero_std": 0.0, "grad_norm": 1.1386572122573853, "kl": 0.0068060653284192085, "learning_rate": 4.944806430988927e-06, "loss": -0.2, "num_tokens": 536890.0, "reward": 0.33250001072883606, "reward_std": 0.5673606991767883, "rewards/reward_func/mean": 0.33250001072883606, "rewards/reward_func/std": 0.5434480309486389, "sampling/importance_sampling_ratio/max": 1.4778003692626953, "sampling/importance_sampling_ratio/mean": 0.9165278673171997, "sampling/importance_sampling_ratio/min": 0.373668909072876, "sampling/sampling_logp_difference/max": 0.5655592679977417, "sampling/sampling_logp_difference/mean": 0.024791110306978226, "step": 96, "step_time": 62.66469675899134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3480789363384247, "epoch": 0.194, "frac_reward_zero_std": 0.0, "grad_norm": 0.8446367979049683, "kl": 0.009365051984786987, "learning_rate": 4.943101624503133e-06, "loss": 0.1505, "num_tokens": 542424.0, "reward": -0.07750000059604645, "reward_std": 0.06912855058908463, "rewards/reward_func/mean": -0.07750000059604645, "rewards/reward_func/std": 0.06453128159046173, "sampling/importance_sampling_ratio/max": 1.3271279335021973, "sampling/importance_sampling_ratio/mean": 0.8955328464508057, "sampling/importance_sampling_ratio/min": 0.2967626452445984, "sampling/sampling_logp_difference/max": 0.4783933162689209, "sampling/sampling_logp_difference/mean": 0.025432758033275604, "step": 97, "step_time": 121.15107800901751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3962988257408142, "epoch": 0.196, "frac_reward_zero_std": 0.0, "grad_norm": 1.7532799243927002, "kl": 0.0025938255712389946, "learning_rate": 4.941371191037353e-06, "loss": 0.5137, "num_tokens": 548175.0, "reward": 0.09125000238418579, "reward_std": 0.27026599645614624, "rewards/reward_func/mean": 0.09125000238418579, "rewards/reward_func/std": 0.35750874876976013, "sampling/importance_sampling_ratio/max": 2.1303369998931885, "sampling/importance_sampling_ratio/mean": 1.1401922702789307, "sampling/importance_sampling_ratio/min": 0.46015465259552, "sampling/sampling_logp_difference/max": 0.5732070207595825, "sampling/sampling_logp_difference/mean": 0.028814753517508507, "step": 98, "step_time": 93.41499740499421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 46.875, "completions/mean_terminated_length": 46.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.36940494179725647, "epoch": 0.198, "frac_reward_zero_std": 0.0, "grad_norm": 1.0608575344085693, "kl": 0.003493869910016656, "learning_rate": 4.939615148743017e-06, "loss": -0.1552, "num_tokens": 553527.0, "reward": 0.20625001192092896, "reward_std": 0.5164605379104614, "rewards/reward_func/mean": 0.20625001192092896, "rewards/reward_func/std": 0.4783584475517273, "sampling/importance_sampling_ratio/max": 1.2723692655563354, "sampling/importance_sampling_ratio/mean": 0.8784043788909912, "sampling/importance_sampling_ratio/min": 0.5372451543807983, "sampling/sampling_logp_difference/max": 0.5341734886169434, "sampling/sampling_logp_difference/mean": 0.02111111767590046, "step": 99, "step_time": 90.65784694100148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3440898358821869, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 1.1309008598327637, "kl": 0.0054893046617507935, "learning_rate": 4.937833516040177e-06, "loss": 0.0197, "num_tokens": 559881.0, "reward": 0.29875001311302185, "reward_std": 0.5750788450241089, "rewards/reward_func/mean": 0.29875001311302185, "rewards/reward_func/std": 0.5545767545700073, "sampling/importance_sampling_ratio/max": 1.1451483964920044, "sampling/importance_sampling_ratio/mean": 0.8518111705780029, "sampling/importance_sampling_ratio/min": 0.6277573108673096, "sampling/sampling_logp_difference/max": 0.48022013902664185, "sampling/sampling_logp_difference/mean": 0.024066496640443802, "step": 100, "step_time": 103.06095111800823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3469806909561157, "epoch": 0.202, "frac_reward_zero_std": 0.0, "grad_norm": 0.7262893319129944, "kl": 0.0019808816723525524, "learning_rate": 4.936026311617316e-06, "loss": 0.0341, "num_tokens": 565252.0, "reward": 0.040000006556510925, "reward_std": 0.3062291443347931, "rewards/reward_func/mean": 0.040000006556510925, "rewards/reward_func/std": 0.3904210329055786, "sampling/importance_sampling_ratio/max": 2.1688106060028076, "sampling/importance_sampling_ratio/mean": 1.0679666996002197, "sampling/importance_sampling_ratio/min": 0.3155788481235504, "sampling/sampling_logp_difference/max": 0.7815747261047363, "sampling/sampling_logp_difference/mean": 0.02425907365977764, "step": 101, "step_time": 103.25376764600514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.34851688146591187, "epoch": 0.204, "frac_reward_zero_std": 0.0, "grad_norm": 1.0761581659317017, "kl": 0.006346363108605146, "learning_rate": 4.9341935544311536e-06, "loss": -0.0727, "num_tokens": 570076.0, "reward": 0.4737500250339508, "reward_std": 0.5936700105667114, "rewards/reward_func/mean": 0.4737500250339508, "rewards/reward_func/std": 0.5496476292610168, "sampling/importance_sampling_ratio/max": 1.647140622138977, "sampling/importance_sampling_ratio/mean": 1.0042234659194946, "sampling/importance_sampling_ratio/min": 0.49350976943969727, "sampling/sampling_logp_difference/max": 0.609084963798523, "sampling/sampling_logp_difference/mean": 0.025170542299747467, "step": 102, "step_time": 64.7651237519749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3427722454071045, "epoch": 0.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.7976966500282288, "kl": 0.0015114627312868834, "learning_rate": 4.932335263706446e-06, "loss": -0.1135, "num_tokens": 576043.0, "reward": 0.5950000286102295, "reward_std": 0.5730479955673218, "rewards/reward_func/mean": 0.5950000286102295, "rewards/reward_func/std": 0.5514914989471436, "sampling/importance_sampling_ratio/max": 1.248305082321167, "sampling/importance_sampling_ratio/mean": 0.944599986076355, "sampling/importance_sampling_ratio/min": 0.5915149450302124, "sampling/sampling_logp_difference/max": 0.2609410285949707, "sampling/sampling_logp_difference/mean": 0.019696425646543503, "step": 103, "step_time": 71.29194252597517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3026958703994751, "epoch": 0.208, "frac_reward_zero_std": 0.0, "grad_norm": 1.035447359085083, "kl": 0.002654898911714554, "learning_rate": 4.930451458935783e-06, "loss": -0.0804, "num_tokens": 580966.0, "reward": 0.4725000262260437, "reward_std": 0.5602477788925171, "rewards/reward_func/mean": 0.4725000262260437, "rewards/reward_func/std": 0.5191407799720764, "sampling/importance_sampling_ratio/max": 1.4709969758987427, "sampling/importance_sampling_ratio/mean": 0.9203107357025146, "sampling/importance_sampling_ratio/min": 0.38464194536209106, "sampling/sampling_logp_difference/max": 0.44011521339416504, "sampling/sampling_logp_difference/mean": 0.019846128299832344, "step": 104, "step_time": 63.931227530993056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3297388553619385, "epoch": 0.21, "frac_reward_zero_std": 0.0, "grad_norm": 1.1884028911590576, "kl": 0.0026048035360872746, "learning_rate": 4.928542159879386e-06, "loss": 0.0376, "num_tokens": 586118.0, "reward": 0.46875, "reward_std": 0.5946630239486694, "rewards/reward_func/mean": 0.46875, "rewards/reward_func/std": 0.550803005695343, "sampling/importance_sampling_ratio/max": 1.9977771043777466, "sampling/importance_sampling_ratio/mean": 0.9485726952552795, "sampling/importance_sampling_ratio/min": 0.21913643181324005, "sampling/sampling_logp_difference/max": 0.32509803771972656, "sampling/sampling_logp_difference/mean": 0.021720722317695618, "step": 105, "step_time": 83.04879019001964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.35155361890792847, "epoch": 0.212, "frac_reward_zero_std": 0.0, "grad_norm": 1.0504732131958008, "kl": 0.004135879687964916, "learning_rate": 4.926607386564898e-06, "loss": 0.1553, "num_tokens": 591400.0, "reward": -0.054999999701976776, "reward_std": 0.04559952765703201, "rewards/reward_func/mean": -0.054999999701976776, "rewards/reward_func/std": 0.05903993919491768, "sampling/importance_sampling_ratio/max": 1.2951682806015015, "sampling/importance_sampling_ratio/mean": 0.6690744757652283, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4716770648956299, "sampling/sampling_logp_difference/mean": 0.02485671453177929, "step": 106, "step_time": 82.7186574760126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.34445077180862427, "epoch": 0.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.5816599726676941, "kl": 0.002885550959035754, "learning_rate": 4.924647159287176e-06, "loss": 0.0717, "num_tokens": 596902.0, "reward": 0.3525000214576721, "reward_std": 0.2688867449760437, "rewards/reward_func/mean": 0.3525000214576721, "rewards/reward_func/std": 0.5281977653503418, "sampling/importance_sampling_ratio/max": 1.3920848369598389, "sampling/importance_sampling_ratio/mean": 0.6316713094711304, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5735256671905518, "sampling/sampling_logp_difference/mean": 0.024873752146959305, "step": 107, "step_time": 67.83759238000493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.34955301880836487, "epoch": 0.216, "frac_reward_zero_std": 0.0, "grad_norm": 1.4328964948654175, "kl": 0.001958028180524707, "learning_rate": 4.922661498608077e-06, "loss": 0.0188, "num_tokens": 602106.0, "reward": 0.4675000309944153, "reward_std": 0.5285453796386719, "rewards/reward_func/mean": 0.4675000309944153, "rewards/reward_func/std": 0.5641112327575684, "sampling/importance_sampling_ratio/max": 1.7434935569763184, "sampling/importance_sampling_ratio/mean": 1.167940616607666, "sampling/importance_sampling_ratio/min": 0.30112484097480774, "sampling/sampling_logp_difference/max": 0.42384326457977295, "sampling/sampling_logp_difference/mean": 0.023688288405537605, "step": 108, "step_time": 72.5184516950103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.875, "completions/mean_terminated_length": 50.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3498673439025879, "epoch": 0.218, "frac_reward_zero_std": 0.0, "grad_norm": 1.5927255153656006, "kl": 0.0023621944710612297, "learning_rate": 4.920650425356239e-06, "loss": -0.2113, "num_tokens": 607347.0, "reward": -0.0625, "reward_std": 0.05268768593668938, "rewards/reward_func/mean": -0.0625, "rewards/reward_func/std": 0.04978525638580322, "sampling/importance_sampling_ratio/max": 1.686551809310913, "sampling/importance_sampling_ratio/mean": 1.2102283239364624, "sampling/importance_sampling_ratio/min": 0.6830826997756958, "sampling/sampling_logp_difference/max": 0.3530259132385254, "sampling/sampling_logp_difference/mean": 0.02098490670323372, "step": 109, "step_time": 86.84248229800141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.35680192708969116, "epoch": 0.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.8017518520355225, "kl": 0.0017795683816075325, "learning_rate": 4.9186139606268735e-06, "loss": -0.0156, "num_tokens": 612704.0, "reward": 0.07250000536441803, "reward_std": 0.2962879240512848, "rewards/reward_func/mean": 0.07250000536441803, "rewards/reward_func/std": 0.3792756497859955, "sampling/importance_sampling_ratio/max": 1.3156285285949707, "sampling/importance_sampling_ratio/mean": 0.9082809686660767, "sampling/importance_sampling_ratio/min": 0.624051570892334, "sampling/sampling_logp_difference/max": 0.40149879455566406, "sampling/sampling_logp_difference/mean": 0.024703415110707283, "step": 110, "step_time": 93.84956424098345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.32336217164993286, "epoch": 0.222, "frac_reward_zero_std": 0.0, "grad_norm": 1.3528348207473755, "kl": 0.002949443645775318, "learning_rate": 4.916552125781529e-06, "loss": -0.0099, "num_tokens": 618399.0, "reward": 0.45625001192092896, "reward_std": 0.6083469390869141, "rewards/reward_func/mean": 0.45625001192092896, "rewards/reward_func/std": 0.5633810758590698, "sampling/importance_sampling_ratio/max": 1.8164138793945312, "sampling/importance_sampling_ratio/mean": 1.0095850229263306, "sampling/importance_sampling_ratio/min": 0.5931808352470398, "sampling/sampling_logp_difference/max": 0.5142123699188232, "sampling/sampling_logp_difference/mean": 0.02279416099190712, "step": 111, "step_time": 79.4126727580151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.30478546023368835, "epoch": 0.224, "frac_reward_zero_std": 0.0, "grad_norm": 1.1281598806381226, "kl": 0.0069794668816030025, "learning_rate": 4.9144649424478765e-06, "loss": -0.0498, "num_tokens": 623892.0, "reward": 0.08249999582767487, "reward_std": 0.2825864851474762, "rewards/reward_func/mean": 0.08249999582767487, "rewards/reward_func/std": 0.37247246503829956, "sampling/importance_sampling_ratio/max": 1.6144704818725586, "sampling/importance_sampling_ratio/mean": 0.8535523414611816, "sampling/importance_sampling_ratio/min": 0.48705849051475525, "sampling/sampling_logp_difference/max": 0.6082849502563477, "sampling/sampling_logp_difference/mean": 0.02124343067407608, "step": 112, "step_time": 95.65409678898868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3653002679347992, "epoch": 0.226, "frac_reward_zero_std": 0.0, "grad_norm": 0.7723777294158936, "kl": 0.008705868385732174, "learning_rate": 4.912352432519484e-06, "loss": -0.0089, "num_tokens": 629286.0, "reward": 0.0650000050663948, "reward_std": 0.2788448631763458, "rewards/reward_func/mean": 0.0650000050663948, "rewards/reward_func/std": 0.36924636363983154, "sampling/importance_sampling_ratio/max": 1.065728783607483, "sampling/importance_sampling_ratio/mean": 0.8029188513755798, "sampling/importance_sampling_ratio/min": 0.6283921003341675, "sampling/sampling_logp_difference/max": 0.4079105854034424, "sampling/sampling_logp_difference/mean": 0.02258678898215294, "step": 113, "step_time": 79.44450050298474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 50.625, "completions/mean_terminated_length": 50.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3991454541683197, "epoch": 0.228, "frac_reward_zero_std": 0.0, "grad_norm": 1.2725372314453125, "kl": 0.004002253524959087, "learning_rate": 4.910214618155579e-06, "loss": -0.3522, "num_tokens": 635091.0, "reward": 0.3412500023841858, "reward_std": 0.5658435821533203, "rewards/reward_func/mean": 0.3412500023841858, "rewards/reward_func/std": 0.5457481741905212, "sampling/importance_sampling_ratio/max": 1.7948518991470337, "sampling/importance_sampling_ratio/mean": 1.1463571786880493, "sampling/importance_sampling_ratio/min": 0.5749549865722656, "sampling/sampling_logp_difference/max": 0.36228108406066895, "sampling/sampling_logp_difference/mean": 0.024780135601758957, "step": 114, "step_time": 75.48941349799861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3554052710533142, "epoch": 0.23, "frac_reward_zero_std": 0.0, "grad_norm": 1.4149909019470215, "kl": 0.00297270598821342, "learning_rate": 4.908051521780824e-06, "loss": -0.0461, "num_tokens": 641015.0, "reward": 0.21000000834465027, "reward_std": 0.5283524990081787, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.4891683757305145, "sampling/importance_sampling_ratio/max": 1.816857933998108, "sampling/importance_sampling_ratio/mean": 1.0504989624023438, "sampling/importance_sampling_ratio/min": 0.6830813884735107, "sampling/sampling_logp_difference/max": 0.25256574153900146, "sampling/sampling_logp_difference/mean": 0.017991136759519577, "step": 115, "step_time": 79.2266922009876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3142127990722656, "epoch": 0.232, "frac_reward_zero_std": 0.0, "grad_norm": 1.1826856136322021, "kl": 0.005817875266075134, "learning_rate": 4.905863166085076e-06, "loss": -0.3008, "num_tokens": 646381.0, "reward": 0.33250001072883606, "reward_std": 0.5690850019454956, "rewards/reward_func/mean": 0.33250001072883606, "rewards/reward_func/std": 0.5483677387237549, "sampling/importance_sampling_ratio/max": 1.5298662185668945, "sampling/importance_sampling_ratio/mean": 0.9026192426681519, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.40772104263305664, "sampling/sampling_logp_difference/mean": 0.021547261625528336, "step": 116, "step_time": 83.28613287501503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.37694644927978516, "epoch": 0.234, "frac_reward_zero_std": 0.0, "grad_norm": 0.9839865565299988, "kl": 0.002937185112386942, "learning_rate": 4.903649574023151e-06, "loss": -0.0315, "num_tokens": 652897.0, "reward": 0.20125000178813934, "reward_std": 0.31244680285453796, "rewards/reward_func/mean": 0.20125000178813934, "rewards/reward_func/std": 0.4629852771759033, "sampling/importance_sampling_ratio/max": 1.3970638513565063, "sampling/importance_sampling_ratio/mean": 0.8803737163543701, "sampling/importance_sampling_ratio/min": 0.568600058555603, "sampling/sampling_logp_difference/max": 0.3313124179840088, "sampling/sampling_logp_difference/mean": 0.02077941596508026, "step": 117, "step_time": 92.2992887319997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 50.875, "completions/mean_terminated_length": 50.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3316619396209717, "epoch": 0.236, "frac_reward_zero_std": 0.0, "grad_norm": 1.2543243169784546, "kl": 0.0029075820930302143, "learning_rate": 4.901410768814581e-06, "loss": 0.3369, "num_tokens": 659068.0, "reward": 0.06875000149011612, "reward_std": 0.28412488102912903, "rewards/reward_func/mean": 0.06875000149011612, "rewards/reward_func/std": 0.36701256036758423, "sampling/importance_sampling_ratio/max": 1.9490493535995483, "sampling/importance_sampling_ratio/mean": 1.230026125907898, "sampling/importance_sampling_ratio/min": 0.5826879739761353, "sampling/sampling_logp_difference/max": 0.3663163185119629, "sampling/sampling_logp_difference/mean": 0.02376718446612358, "step": 118, "step_time": 81.04622099900735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3243013620376587, "epoch": 0.238, "frac_reward_zero_std": 0.0, "grad_norm": 1.0776394605636597, "kl": 0.003003204707056284, "learning_rate": 4.899146773943374e-06, "loss": 0.0888, "num_tokens": 664052.0, "reward": 0.45374998450279236, "reward_std": 0.5213634967803955, "rewards/reward_func/mean": 0.45374998450279236, "rewards/reward_func/std": 0.555361807346344, "sampling/importance_sampling_ratio/max": 1.163706660270691, "sampling/importance_sampling_ratio/mean": 0.9491457939147949, "sampling/importance_sampling_ratio/min": 0.7341592311859131, "sampling/sampling_logp_difference/max": 0.3309454917907715, "sampling/sampling_logp_difference/mean": 0.021823348477482796, "step": 119, "step_time": 41.72294841398252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.36999672651290894, "epoch": 0.24, "frac_reward_zero_std": 0.0, "grad_norm": 1.41775381565094, "kl": 0.002642344683408737, "learning_rate": 4.896857613157765e-06, "loss": -0.1831, "num_tokens": 669769.0, "reward": 0.06624999642372131, "reward_std": 0.2767047882080078, "rewards/reward_func/mean": 0.06624999642372131, "rewards/reward_func/std": 0.37159648537635803, "sampling/importance_sampling_ratio/max": 1.3718173503875732, "sampling/importance_sampling_ratio/mean": 0.9549704194068909, "sampling/importance_sampling_ratio/min": 0.5532563924789429, "sampling/sampling_logp_difference/max": 0.5324568748474121, "sampling/sampling_logp_difference/mean": 0.02652132511138916, "step": 120, "step_time": 92.35115976299858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.35806554555892944, "epoch": 0.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.6040545105934143, "kl": 0.0024742307141423225, "learning_rate": 4.894543310469968e-06, "loss": -0.0793, "num_tokens": 675165.0, "reward": 0.06000000610947609, "reward_std": 0.2803305685520172, "rewards/reward_func/mean": 0.06000000610947609, "rewards/reward_func/std": 0.3688979744911194, "sampling/importance_sampling_ratio/max": 2.129168748855591, "sampling/importance_sampling_ratio/mean": 0.9699528813362122, "sampling/importance_sampling_ratio/min": 0.44781070947647095, "sampling/sampling_logp_difference/max": 0.2890472412109375, "sampling/sampling_logp_difference/mean": 0.022462455555796623, "step": 121, "step_time": 78.31796040100744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3762264847755432, "epoch": 0.244, "frac_reward_zero_std": 0.0, "grad_norm": 1.0539501905441284, "kl": 0.005846098996698856, "learning_rate": 4.8922038901559225e-06, "loss": 0.0026, "num_tokens": 681091.0, "reward": 0.2162500023841858, "reward_std": 0.5134057998657227, "rewards/reward_func/mean": 0.2162500023841858, "rewards/reward_func/std": 0.47575318813323975, "sampling/importance_sampling_ratio/max": 1.4815739393234253, "sampling/importance_sampling_ratio/mean": 0.9299391508102417, "sampling/importance_sampling_ratio/min": 0.5258536338806152, "sampling/sampling_logp_difference/max": 0.30264854431152344, "sampling/sampling_logp_difference/mean": 0.020659077912569046, "step": 122, "step_time": 82.64369376702234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 46.375, "completions/mean_terminated_length": 46.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.35361769795417786, "epoch": 0.246, "frac_reward_zero_std": 0.0, "grad_norm": 1.1598920822143555, "kl": 0.003386750351637602, "learning_rate": 4.889839376755041e-06, "loss": 0.0283, "num_tokens": 687305.0, "reward": 0.3087500035762787, "reward_std": 0.5727449655532837, "rewards/reward_func/mean": 0.3087500035762787, "rewards/reward_func/std": 0.5458528995513916, "sampling/importance_sampling_ratio/max": 1.7175296545028687, "sampling/importance_sampling_ratio/mean": 1.1679165363311768, "sampling/importance_sampling_ratio/min": 0.6514824628829956, "sampling/sampling_logp_difference/max": 0.47839367389678955, "sampling/sampling_logp_difference/mean": 0.023937463760375977, "step": 123, "step_time": 90.77700008201646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.35109275579452515, "epoch": 0.248, "frac_reward_zero_std": 0.0, "grad_norm": 1.0874840021133423, "kl": 0.0036450112238526344, "learning_rate": 4.887449795069948e-06, "loss": 0.1575, "num_tokens": 693333.0, "reward": 0.4699999988079071, "reward_std": 0.02523816004395485, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.5484002232551575, "sampling/importance_sampling_ratio/max": 2.278353214263916, "sampling/importance_sampling_ratio/mean": 1.207700490951538, "sampling/importance_sampling_ratio/min": 0.5298588871955872, "sampling/sampling_logp_difference/max": 0.27920615673065186, "sampling/sampling_logp_difference/mean": 0.023498259484767914, "step": 124, "step_time": 39.61502477500471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.35430189967155457, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 2.6847572326660156, "kl": 0.004919369705021381, "learning_rate": 4.885035170166229e-06, "loss": -0.2698, "num_tokens": 698906.0, "reward": 0.0949999988079071, "reward_std": 0.2718702256679535, "rewards/reward_func/mean": 0.0949999988079071, "rewards/reward_func/std": 0.36629417538642883, "sampling/importance_sampling_ratio/max": 1.845292329788208, "sampling/importance_sampling_ratio/mean": 1.1654714345932007, "sampling/importance_sampling_ratio/min": 0.6519782543182373, "sampling/sampling_logp_difference/max": 0.4780464172363281, "sampling/sampling_logp_difference/mean": 0.0236376766115427, "step": 125, "step_time": 84.81572797399713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.37540292739868164, "epoch": 0.252, "frac_reward_zero_std": 0.0, "grad_norm": 1.1251049041748047, "kl": 0.004228860605508089, "learning_rate": 4.8825955273721524e-06, "loss": -0.1821, "num_tokens": 704535.0, "reward": 0.19999998807907104, "reward_std": 0.5351865291595459, "rewards/reward_func/mean": 0.19999998807907104, "rewards/reward_func/std": 0.49549400806427, "sampling/importance_sampling_ratio/max": 1.2624843120574951, "sampling/importance_sampling_ratio/mean": 0.850989818572998, "sampling/importance_sampling_ratio/min": 0.30390864610671997, "sampling/sampling_logp_difference/max": 0.3254203796386719, "sampling/sampling_logp_difference/mean": 0.022064577788114548, "step": 126, "step_time": 76.2562780379958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.41910192370414734, "epoch": 0.254, "frac_reward_zero_std": 0.0, "grad_norm": 1.317619800567627, "kl": 0.005742704961448908, "learning_rate": 4.88013089227842e-06, "loss": 0.0236, "num_tokens": 709703.0, "reward": 0.4424999952316284, "reward_std": 0.5581981539726257, "rewards/reward_func/mean": 0.4424999952316284, "rewards/reward_func/std": 0.5799199342727661, "sampling/importance_sampling_ratio/max": 1.348827838897705, "sampling/importance_sampling_ratio/mean": 0.9275149703025818, "sampling/importance_sampling_ratio/min": 0.33642151951789856, "sampling/sampling_logp_difference/max": 0.3352065086364746, "sampling/sampling_logp_difference/mean": 0.027083944529294968, "step": 127, "step_time": 63.868688657006714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3978528380393982, "epoch": 0.256, "frac_reward_zero_std": 0.0, "grad_norm": 1.7018318176269531, "kl": 0.0047828396782279015, "learning_rate": 4.8776412907378845e-06, "loss": -0.1779, "num_tokens": 715735.0, "reward": 0.21000000834465027, "reward_std": 0.5070174336433411, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.46940696239471436, "sampling/importance_sampling_ratio/max": 1.5801129341125488, "sampling/importance_sampling_ratio/mean": 0.823201060295105, "sampling/importance_sampling_ratio/min": 0.35874059796333313, "sampling/sampling_logp_difference/max": 0.6062784194946289, "sampling/sampling_logp_difference/mean": 0.02410537376999855, "step": 128, "step_time": 83.61712833400816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.36482003331184387, "epoch": 0.258, "frac_reward_zero_std": 0.0, "grad_norm": 1.880913496017456, "kl": 0.005393403582274914, "learning_rate": 4.87512674886529e-06, "loss": 0.1742, "num_tokens": 720958.0, "reward": 0.20374999940395355, "reward_std": 0.311365008354187, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.46601465344429016, "sampling/importance_sampling_ratio/max": 2.4826807975769043, "sampling/importance_sampling_ratio/mean": 1.167348861694336, "sampling/importance_sampling_ratio/min": 0.26117852330207825, "sampling/sampling_logp_difference/max": 0.8425577878952026, "sampling/sampling_logp_difference/mean": 0.025742068886756897, "step": 129, "step_time": 64.07934993301751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.371246337890625, "epoch": 0.26, "frac_reward_zero_std": 0.0, "grad_norm": 1.614325761795044, "kl": 0.0035083587281405926, "learning_rate": 4.872587293036991e-06, "loss": -0.2087, "num_tokens": 727108.0, "reward": 0.29375001788139343, "reward_std": 0.562328040599823, "rewards/reward_func/mean": 0.29375001788139343, "rewards/reward_func/std": 0.537532389163971, "sampling/importance_sampling_ratio/max": 1.6865801811218262, "sampling/importance_sampling_ratio/mean": 1.0800793170928955, "sampling/importance_sampling_ratio/min": 0.5898501873016357, "sampling/sampling_logp_difference/max": 0.3827958106994629, "sampling/sampling_logp_difference/mean": 0.024071460589766502, "step": 130, "step_time": 78.63163189700572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.32340848445892334, "epoch": 0.262, "frac_reward_zero_std": 0.0, "grad_norm": 1.304259181022644, "kl": 0.004556507803499699, "learning_rate": 4.870022949890676e-06, "loss": 0.207, "num_tokens": 733001.0, "reward": 0.19374999403953552, "reward_std": 0.535616397857666, "rewards/reward_func/mean": 0.19374999403953552, "rewards/reward_func/std": 0.495895653963089, "sampling/importance_sampling_ratio/max": 2.3520002365112305, "sampling/importance_sampling_ratio/mean": 1.1608736515045166, "sampling/importance_sampling_ratio/min": 0.5538614988327026, "sampling/sampling_logp_difference/max": 0.394594669342041, "sampling/sampling_logp_difference/mean": 0.0221773199737072, "step": 131, "step_time": 92.43735384300817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.125, "completions/mean_terminated_length": 46.125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3576907515525818, "epoch": 0.264, "frac_reward_zero_std": 0.0, "grad_norm": 2.0428054332733154, "kl": 0.018777361139655113, "learning_rate": 4.867433746325093e-06, "loss": -0.0642, "num_tokens": 739105.0, "reward": 0.21875, "reward_std": 0.5183683633804321, "rewards/reward_func/mean": 0.21875, "rewards/reward_func/std": 0.4804295003414154, "sampling/importance_sampling_ratio/max": 1.5391448736190796, "sampling/importance_sampling_ratio/mean": 0.9020660519599915, "sampling/importance_sampling_ratio/min": 0.2766675651073456, "sampling/sampling_logp_difference/max": 1.2109692096710205, "sampling/sampling_logp_difference/mean": 0.033887773752212524, "step": 132, "step_time": 87.67151060700417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.33072763681411743, "epoch": 0.266, "frac_reward_zero_std": 0.0, "grad_norm": 1.5945179462432861, "kl": 0.0026382263749837875, "learning_rate": 4.864819709499762e-06, "loss": -0.0036, "num_tokens": 744440.0, "reward": 0.5874999761581421, "reward_std": 0.5714925527572632, "rewards/reward_func/mean": 0.5874999761581421, "rewards/reward_func/std": 0.5562823414802551, "sampling/importance_sampling_ratio/max": 2.234254837036133, "sampling/importance_sampling_ratio/mean": 1.3247432708740234, "sampling/importance_sampling_ratio/min": 0.6224689483642578, "sampling/sampling_logp_difference/max": 0.3226501941680908, "sampling/sampling_logp_difference/mean": 0.021056218072772026, "step": 133, "step_time": 55.756049841002095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3642032742500305, "epoch": 0.268, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390766859054565, "kl": 0.0025580194778740406, "learning_rate": 4.862180866834691e-06, "loss": -0.2011, "num_tokens": 750197.0, "reward": 0.3425000011920929, "reward_std": 0.5499535799026489, "rewards/reward_func/mean": 0.3425000011920929, "rewards/reward_func/std": 0.5348631739616394, "sampling/importance_sampling_ratio/max": 2.091257333755493, "sampling/importance_sampling_ratio/mean": 0.9505029916763306, "sampling/importance_sampling_ratio/min": 0.39569008350372314, "sampling/sampling_logp_difference/max": 0.5128500461578369, "sampling/sampling_logp_difference/mean": 0.022797472774982452, "step": 134, "step_time": 77.64818813299644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.35776978731155396, "epoch": 0.27, "frac_reward_zero_std": 0.0, "grad_norm": 1.332118034362793, "kl": 0.0026144185103476048, "learning_rate": 4.8595172460100914e-06, "loss": -0.1461, "num_tokens": 755246.0, "reward": 0.20875000953674316, "reward_std": 0.3141333758831024, "rewards/reward_func/mean": 0.20875000953674316, "rewards/reward_func/std": 0.4824472665786743, "sampling/importance_sampling_ratio/max": 2.2898991107940674, "sampling/importance_sampling_ratio/mean": 1.1793955564498901, "sampling/importance_sampling_ratio/min": 0.3208160698413849, "sampling/sampling_logp_difference/max": 0.32736682891845703, "sampling/sampling_logp_difference/mean": 0.025175008922815323, "step": 135, "step_time": 94.36543551200884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.326175332069397, "epoch": 0.272, "frac_reward_zero_std": 0.0, "grad_norm": 1.042453408241272, "kl": 0.002404378727078438, "learning_rate": 4.856828874966086e-06, "loss": 0.0913, "num_tokens": 761403.0, "reward": 0.08624999970197678, "reward_std": 0.24735195934772491, "rewards/reward_func/mean": 0.08624999970197678, "rewards/reward_func/std": 0.33019205927848816, "sampling/importance_sampling_ratio/max": 1.598332405090332, "sampling/importance_sampling_ratio/mean": 1.1521015167236328, "sampling/importance_sampling_ratio/min": 0.8144843578338623, "sampling/sampling_logp_difference/max": 0.30525922775268555, "sampling/sampling_logp_difference/mean": 0.02139485627412796, "step": 136, "step_time": 89.18075294501614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 43.125, "completions/mean_terminated_length": 43.125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.32590705156326294, "epoch": 0.274, "frac_reward_zero_std": 0.0, "grad_norm": 0.6651734709739685, "kl": 0.00382244773209095, "learning_rate": 4.854115781902414e-06, "loss": 0.0387, "num_tokens": 767277.0, "reward": 0.3462499976158142, "reward_std": 0.5454127788543701, "rewards/reward_func/mean": 0.3462499976158142, "rewards/reward_func/std": 0.5253825187683105, "sampling/importance_sampling_ratio/max": 1.1137996912002563, "sampling/importance_sampling_ratio/mean": 0.6871470808982849, "sampling/importance_sampling_ratio/min": 0.2649921774864197, "sampling/sampling_logp_difference/max": 0.43839168548583984, "sampling/sampling_logp_difference/mean": 0.024401474744081497, "step": 137, "step_time": 63.591420451994054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3442898988723755, "epoch": 0.276, "frac_reward_zero_std": 0.0, "grad_norm": 1.3569279909133911, "kl": 0.006373442709445953, "learning_rate": 4.851377995278138e-06, "loss": 0.0322, "num_tokens": 772862.0, "reward": 0.34375, "reward_std": 0.28818657994270325, "rewards/reward_func/mean": 0.34375, "rewards/reward_func/std": 0.5449754595756531, "sampling/importance_sampling_ratio/max": 2.1303188800811768, "sampling/importance_sampling_ratio/mean": 1.062652587890625, "sampling/importance_sampling_ratio/min": 0.4433048367500305, "sampling/sampling_logp_difference/max": 0.5312175750732422, "sampling/sampling_logp_difference/mean": 0.027195535600185394, "step": 138, "step_time": 79.88628011097899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.33581745624542236, "epoch": 0.278, "frac_reward_zero_std": 0.0, "grad_norm": 0.8458130955696106, "kl": 0.003910760395228863, "learning_rate": 4.8486155438113455e-06, "loss": 0.1521, "num_tokens": 778552.0, "reward": 0.07750000059604645, "reward_std": 0.27364206314086914, "rewards/reward_func/mean": 0.07750000059604645, "rewards/reward_func/std": 0.3691205680370331, "sampling/importance_sampling_ratio/max": 1.5847712755203247, "sampling/importance_sampling_ratio/mean": 0.9813762307167053, "sampling/importance_sampling_ratio/min": 0.5472065806388855, "sampling/sampling_logp_difference/max": 0.3161931037902832, "sampling/sampling_logp_difference/mean": 0.017320292070508003, "step": 139, "step_time": 76.26634333998663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 45.875, "completions/mean_terminated_length": 45.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3359699845314026, "epoch": 0.28, "frac_reward_zero_std": 0.0, "grad_norm": 1.083722710609436, "kl": 0.06697973608970642, "learning_rate": 4.845828456478843e-06, "loss": 0.2268, "num_tokens": 784051.0, "reward": 0.2162499874830246, "reward_std": 0.510195791721344, "rewards/reward_func/mean": 0.2162499874830246, "rewards/reward_func/std": 0.47307315468788147, "sampling/importance_sampling_ratio/max": 1.3066641092300415, "sampling/importance_sampling_ratio/mean": 0.8830969929695129, "sampling/importance_sampling_ratio/min": 0.5595781207084656, "sampling/sampling_logp_difference/max": 0.38854122161865234, "sampling/sampling_logp_difference/mean": 0.026553025469183922, "step": 140, "step_time": 69.88314274000004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.31609684228897095, "epoch": 0.282, "frac_reward_zero_std": 0.0, "grad_norm": 2.061277389526367, "kl": 0.013994975946843624, "learning_rate": 4.84301676251586e-06, "loss": 0.3487, "num_tokens": 788975.0, "reward": -0.07000000029802322, "reward_std": 0.052655644714832306, "rewards/reward_func/mean": -0.07000000029802322, "rewards/reward_func/std": 0.051823876798152924, "sampling/importance_sampling_ratio/max": 2.0056557655334473, "sampling/importance_sampling_ratio/mean": 0.9070459008216858, "sampling/importance_sampling_ratio/min": 0.40230387449264526, "sampling/sampling_logp_difference/max": 0.49358463287353516, "sampling/sampling_logp_difference/mean": 0.023013930767774582, "step": 141, "step_time": 86.36220617999788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.39303815364837646, "epoch": 0.284, "frac_reward_zero_std": 0.0, "grad_norm": 1.3883771896362305, "kl": 0.0059036314487457275, "learning_rate": 4.840180491415733e-06, "loss": 0.0691, "num_tokens": 794046.0, "reward": 0.1899999976158142, "reward_std": 0.3205098509788513, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.48966461420059204, "sampling/importance_sampling_ratio/max": 1.5588135719299316, "sampling/importance_sampling_ratio/mean": 1.177175521850586, "sampling/importance_sampling_ratio/min": 0.6510878801345825, "sampling/sampling_logp_difference/max": 0.8279815912246704, "sampling/sampling_logp_difference/mean": 0.02602643519639969, "step": 142, "step_time": 74.00148760300362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3808709979057312, "epoch": 0.286, "frac_reward_zero_std": 0.0, "grad_norm": 0.9528871774673462, "kl": 0.0036585263442248106, "learning_rate": 4.837319672929606e-06, "loss": 0.0533, "num_tokens": 800535.0, "reward": 0.023750003427267075, "reward_std": 0.3143449127674103, "rewards/reward_func/mean": 0.023750003427267075, "rewards/reward_func/std": 0.3961578905582428, "sampling/importance_sampling_ratio/max": 1.8512533903121948, "sampling/importance_sampling_ratio/mean": 0.934387743473053, "sampling/importance_sampling_ratio/min": 0.49656394124031067, "sampling/sampling_logp_difference/max": 0.3407723903656006, "sampling/sampling_logp_difference/mean": 0.02522529661655426, "step": 143, "step_time": 103.3366472539783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.33955660462379456, "epoch": 0.288, "frac_reward_zero_std": 0.0, "grad_norm": 1.3187702894210815, "kl": 0.003852253081277013, "learning_rate": 4.834434337066112e-06, "loss": -0.0678, "num_tokens": 806790.0, "reward": 0.4737499952316284, "reward_std": 0.6020057201385498, "rewards/reward_func/mean": 0.4737499952316284, "rewards/reward_func/std": 0.5573647022247314, "sampling/importance_sampling_ratio/max": 1.97614586353302, "sampling/importance_sampling_ratio/mean": 1.0368638038635254, "sampling/importance_sampling_ratio/min": 0.6063994765281677, "sampling/sampling_logp_difference/max": 0.3311631679534912, "sampling/sampling_logp_difference/mean": 0.02065013162791729, "step": 144, "step_time": 65.69823711100616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.331376314163208, "epoch": 0.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.9537304043769836, "kl": 0.007181019987910986, "learning_rate": 4.831524514091056e-06, "loss": -0.0085, "num_tokens": 812242.0, "reward": 0.17624999582767487, "reward_std": 0.32842689752578735, "rewards/reward_func/mean": 0.17624999582767487, "rewards/reward_func/std": 0.47101524472236633, "sampling/importance_sampling_ratio/max": 1.308451533317566, "sampling/importance_sampling_ratio/mean": 0.9442053437232971, "sampling/importance_sampling_ratio/min": 0.579285740852356, "sampling/sampling_logp_difference/max": 0.34829843044281006, "sampling/sampling_logp_difference/mean": 0.017970219254493713, "step": 145, "step_time": 79.94236772001022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.36558613181114197, "epoch": 0.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.937701404094696, "kl": 0.006811534054577351, "learning_rate": 4.828590234527107e-06, "loss": -0.0287, "num_tokens": 817572.0, "reward": 0.4725000262260437, "reward_std": 0.48875167965888977, "rewards/reward_func/mean": 0.4725000262260437, "rewards/reward_func/std": 0.5325075387954712, "sampling/importance_sampling_ratio/max": 1.5232332944869995, "sampling/importance_sampling_ratio/mean": 0.8906862735748291, "sampling/importance_sampling_ratio/min": 0.5748233199119568, "sampling/sampling_logp_difference/max": 0.3986041247844696, "sampling/sampling_logp_difference/mean": 0.022225454449653625, "step": 146, "step_time": 47.61015773200779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.125, "completions/mean_terminated_length": 48.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.34926676750183105, "epoch": 0.294, "frac_reward_zero_std": 0.0, "grad_norm": 1.4631636142730713, "kl": 0.005530308000743389, "learning_rate": 4.825631529153466e-06, "loss": 0.0385, "num_tokens": 823067.0, "reward": -0.05374999716877937, "reward_std": 0.0510866716504097, "rewards/reward_func/mean": -0.05374999716877937, "rewards/reward_func/std": 0.05655276030302048, "sampling/importance_sampling_ratio/max": 1.7795029878616333, "sampling/importance_sampling_ratio/mean": 1.1954052448272705, "sampling/importance_sampling_ratio/min": 0.6682614088058472, "sampling/sampling_logp_difference/max": 0.4499216079711914, "sampling/sampling_logp_difference/mean": 0.021715868264436722, "step": 147, "step_time": 97.10051084400038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.34877437353134155, "epoch": 0.296, "frac_reward_zero_std": 0.0, "grad_norm": 1.521934986114502, "kl": 0.005301266442984343, "learning_rate": 4.8226484290055544e-06, "loss": 0.0887, "num_tokens": 828804.0, "reward": 0.48374998569488525, "reward_std": 0.5961781144142151, "rewards/reward_func/mean": 0.48374998569488525, "rewards/reward_func/std": 0.5521112680435181, "sampling/importance_sampling_ratio/max": 1.8339399099349976, "sampling/importance_sampling_ratio/mean": 1.0161359310150146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7577025890350342, "sampling/sampling_logp_difference/mean": 0.025348538532853127, "step": 148, "step_time": 61.746048774017254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3230116367340088, "epoch": 0.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.9116004705429077, "kl": 0.011923927813768387, "learning_rate": 4.8196409653746815e-06, "loss": 0.0086, "num_tokens": 834368.0, "reward": 0.05874999612569809, "reward_std": 0.28938817977905273, "rewards/reward_func/mean": 0.05874999612569809, "rewards/reward_func/std": 0.38327306509017944, "sampling/importance_sampling_ratio/max": 1.5544939041137695, "sampling/importance_sampling_ratio/mean": 0.9016103148460388, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2695889472961426, "sampling/sampling_logp_difference/mean": 0.02005579136312008, "step": 149, "step_time": 93.92776288898312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 46.875, "completions/mean_terminated_length": 46.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.31266871094703674, "epoch": 0.3, "frac_reward_zero_std": 0.0, "grad_norm": 1.0015181303024292, "kl": 0.003962170798331499, "learning_rate": 4.8166091698077165e-06, "loss": -0.0324, "num_tokens": 839246.0, "reward": 0.22374999523162842, "reward_std": 0.5151989459991455, "rewards/reward_func/mean": 0.22374999523162842, "rewards/reward_func/std": 0.47782060503959656, "sampling/importance_sampling_ratio/max": 1.158785343170166, "sampling/importance_sampling_ratio/mean": 0.86640864610672, "sampling/importance_sampling_ratio/min": 0.6415550112724304, "sampling/sampling_logp_difference/max": 0.31314682960510254, "sampling/sampling_logp_difference/mean": 0.018176782876253128, "step": 150, "step_time": 90.2421096219914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 43.125, "completions/mean_terminated_length": 43.125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.28932589292526245, "epoch": 0.302, "frac_reward_zero_std": 0.0, "grad_norm": 2.364370584487915, "kl": 0.00638082530349493, "learning_rate": 4.813553074106761e-06, "loss": -0.166, "num_tokens": 844201.0, "reward": 0.3137499988079071, "reward_std": 0.5894033312797546, "rewards/reward_func/mean": 0.3137499988079071, "rewards/reward_func/std": 0.5638119578361511, "sampling/importance_sampling_ratio/max": 2.0728578567504883, "sampling/importance_sampling_ratio/mean": 1.2075482606887817, "sampling/importance_sampling_ratio/min": 0.5291450023651123, "sampling/sampling_logp_difference/max": 0.6335185766220093, "sampling/sampling_logp_difference/mean": 0.022180885076522827, "step": 151, "step_time": 77.12891793900053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.39411360025405884, "epoch": 0.304, "frac_reward_zero_std": 0.0, "grad_norm": 1.070342779159546, "kl": 0.006427218206226826, "learning_rate": 4.8104727103288125e-06, "loss": 0.123, "num_tokens": 850002.0, "reward": -0.03125, "reward_std": 0.026997683569788933, "rewards/reward_func/mean": -0.03125, "rewards/reward_func/std": 0.025319388136267662, "sampling/importance_sampling_ratio/max": 1.8757935762405396, "sampling/importance_sampling_ratio/mean": 1.1552492380142212, "sampling/importance_sampling_ratio/min": 0.589501678943634, "sampling/sampling_logp_difference/max": 0.4456930160522461, "sampling/sampling_logp_difference/mean": 0.027546117082238197, "step": 152, "step_time": 90.55313208300504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.36300113797187805, "epoch": 0.306, "frac_reward_zero_std": 0.0, "grad_norm": 1.0927797555923462, "kl": 0.005236159078776836, "learning_rate": 4.80736811078543e-06, "loss": -0.1923, "num_tokens": 855547.0, "reward": 0.33000001311302185, "reward_std": 0.560116171836853, "rewards/reward_func/mean": 0.33000001311302185, "rewards/reward_func/std": 0.538993775844574, "sampling/importance_sampling_ratio/max": 1.9103295803070068, "sampling/importance_sampling_ratio/mean": 1.0825953483581543, "sampling/importance_sampling_ratio/min": 0.6134956479072571, "sampling/sampling_logp_difference/max": 0.306821346282959, "sampling/sampling_logp_difference/mean": 0.018912725150585175, "step": 153, "step_time": 79.64092588599306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3884934186935425, "epoch": 0.308, "frac_reward_zero_std": 0.0, "grad_norm": 1.1354352235794067, "kl": 0.00976630486547947, "learning_rate": 4.804239308042392e-06, "loss": 0.0839, "num_tokens": 861032.0, "reward": 0.4975000023841858, "reward_std": 0.5802370309829712, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5372084379196167, "sampling/importance_sampling_ratio/max": 1.285419225692749, "sampling/importance_sampling_ratio/mean": 0.8535559177398682, "sampling/importance_sampling_ratio/min": 0.2762400805950165, "sampling/sampling_logp_difference/max": 0.7990829944610596, "sampling/sampling_logp_difference/mean": 0.027860336005687714, "step": 154, "step_time": 65.1173454009986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3430503308773041, "epoch": 0.31, "frac_reward_zero_std": 0.0, "grad_norm": 1.299552083015442, "kl": 0.005469894502311945, "learning_rate": 4.8010863349193605e-06, "loss": -0.1091, "num_tokens": 866546.0, "reward": 0.46000000834465027, "reward_std": 0.5039982199668884, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.5573149919509888, "sampling/importance_sampling_ratio/max": 1.5915218591690063, "sampling/importance_sampling_ratio/mean": 1.0774503946304321, "sampling/importance_sampling_ratio/min": 0.6246634721755981, "sampling/sampling_logp_difference/max": 0.32491254806518555, "sampling/sampling_logp_difference/mean": 0.021029043942689896, "step": 155, "step_time": 62.39225424500182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3359929323196411, "epoch": 0.312, "frac_reward_zero_std": 0.0, "grad_norm": 1.646360158920288, "kl": 0.013532448559999466, "learning_rate": 4.797909224489531e-06, "loss": 0.0599, "num_tokens": 872235.0, "reward": 0.08875000476837158, "reward_std": 0.27286672592163086, "rewards/reward_func/mean": 0.08875000476837158, "rewards/reward_func/std": 0.365764856338501, "sampling/importance_sampling_ratio/max": 1.6614181995391846, "sampling/importance_sampling_ratio/mean": 0.9551100730895996, "sampling/importance_sampling_ratio/min": 0.45479723811149597, "sampling/sampling_logp_difference/max": 0.41816186904907227, "sampling/sampling_logp_difference/mean": 0.02349046617746353, "step": 156, "step_time": 86.28342253799201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.36201316118240356, "epoch": 0.314, "frac_reward_zero_std": 0.0, "grad_norm": 1.2846462726593018, "kl": 0.0050716521218419075, "learning_rate": 4.794708010079288e-06, "loss": 0.1633, "num_tokens": 878136.0, "reward": 0.3462499976158142, "reward_std": 0.5590543746948242, "rewards/reward_func/mean": 0.3462499976158142, "rewards/reward_func/std": 0.5334774851799011, "sampling/importance_sampling_ratio/max": 1.8937947750091553, "sampling/importance_sampling_ratio/mean": 1.087288498878479, "sampling/importance_sampling_ratio/min": 0.5765236020088196, "sampling/sampling_logp_difference/max": 0.5770103931427002, "sampling/sampling_logp_difference/mean": 0.019394293427467346, "step": 157, "step_time": 77.44638174801366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.35876867175102234, "epoch": 0.316, "frac_reward_zero_std": 0.0, "grad_norm": 1.6277668476104736, "kl": 0.012355468235909939, "learning_rate": 4.791482725267858e-06, "loss": 0.0167, "num_tokens": 883346.0, "reward": 0.32499998807907104, "reward_std": 0.5781960487365723, "rewards/reward_func/mean": 0.32499998807907104, "rewards/reward_func/std": 0.5492072105407715, "sampling/importance_sampling_ratio/max": 1.5887843370437622, "sampling/importance_sampling_ratio/mean": 0.9820950627326965, "sampling/importance_sampling_ratio/min": 0.44207823276519775, "sampling/sampling_logp_difference/max": 0.6413552761077881, "sampling/sampling_logp_difference/mean": 0.023921802639961243, "step": 158, "step_time": 64.3381597440166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.32425469160079956, "epoch": 0.318, "frac_reward_zero_std": 0.0, "grad_norm": 0.9400520324707031, "kl": 0.004790422506630421, "learning_rate": 4.78823340388695e-06, "loss": -0.117, "num_tokens": 889686.0, "reward": 0.2187499850988388, "reward_std": 0.31793859601020813, "rewards/reward_func/mean": 0.2187499850988388, "rewards/reward_func/std": 0.47139421105384827, "sampling/importance_sampling_ratio/max": 1.150166392326355, "sampling/importance_sampling_ratio/mean": 0.8458471894264221, "sampling/importance_sampling_ratio/min": 0.521885335445404, "sampling/sampling_logp_difference/max": 0.45719194412231445, "sampling/sampling_logp_difference/mean": 0.020912881940603256, "step": 159, "step_time": 83.63672647799831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.362440288066864, "epoch": 0.32, "frac_reward_zero_std": 0.0, "grad_norm": 1.179018259048462, "kl": 0.009544030763208866, "learning_rate": 4.7849600800204075e-06, "loss": -0.0725, "num_tokens": 895303.0, "reward": 0.3062500059604645, "reward_std": 0.5794415473937988, "rewards/reward_func/mean": 0.3062500059604645, "rewards/reward_func/std": 0.5538163185119629, "sampling/importance_sampling_ratio/max": 1.3555833101272583, "sampling/importance_sampling_ratio/mean": 0.8662522435188293, "sampling/importance_sampling_ratio/min": 0.48955845832824707, "sampling/sampling_logp_difference/max": 0.3127005100250244, "sampling/sampling_logp_difference/mean": 0.02036750502884388, "step": 160, "step_time": 75.56624679401284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3294234275817871, "epoch": 0.322, "frac_reward_zero_std": 0.0, "grad_norm": 0.9646108150482178, "kl": 0.008349942974746227, "learning_rate": 4.781662788003851e-06, "loss": 0.04, "num_tokens": 900212.0, "reward": 0.3125, "reward_std": 0.28340619802474976, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.5329098105430603, "sampling/importance_sampling_ratio/max": 1.3511476516723633, "sampling/importance_sampling_ratio/mean": 0.9460296034812927, "sampling/importance_sampling_ratio/min": 0.5577508211135864, "sampling/sampling_logp_difference/max": 0.357053279876709, "sampling/sampling_logp_difference/mean": 0.02139732614159584, "step": 161, "step_time": 63.430384036997566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.37909457087516785, "epoch": 0.324, "frac_reward_zero_std": 0.0, "grad_norm": 1.1150704622268677, "kl": 0.006739302072674036, "learning_rate": 4.778341562424312e-06, "loss": -0.0058, "num_tokens": 905567.0, "reward": 0.05624999478459358, "reward_std": 0.30837467312812805, "rewards/reward_func/mean": 0.05624999478459358, "rewards/reward_func/std": 0.3843710124492645, "sampling/importance_sampling_ratio/max": 1.5031989812850952, "sampling/importance_sampling_ratio/mean": 0.9705907106399536, "sampling/importance_sampling_ratio/min": 0.4351043999195099, "sampling/sampling_logp_difference/max": 0.31818532943725586, "sampling/sampling_logp_difference/mean": 0.023320209234952927, "step": 162, "step_time": 88.88530365700717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 49.375, "completions/mean_terminated_length": 49.375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3668830394744873, "epoch": 0.326, "frac_reward_zero_std": 0.0, "grad_norm": 1.6112715005874634, "kl": 0.00733697135001421, "learning_rate": 4.774996438119876e-06, "loss": -0.2199, "num_tokens": 910978.0, "reward": 0.4387499988079071, "reward_std": 0.6334177255630493, "rewards/reward_func/mean": 0.4387499988079071, "rewards/reward_func/std": 0.5865988731384277, "sampling/importance_sampling_ratio/max": 1.7501353025436401, "sampling/importance_sampling_ratio/mean": 1.1737459897994995, "sampling/importance_sampling_ratio/min": 0.5407097935676575, "sampling/sampling_logp_difference/max": 0.5975207090377808, "sampling/sampling_logp_difference/mean": 0.023949533700942993, "step": 163, "step_time": 77.05458755500149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3550381660461426, "epoch": 0.328, "frac_reward_zero_std": 0.0, "grad_norm": 1.4822602272033691, "kl": 0.018688620999455452, "learning_rate": 4.771627450179315e-06, "loss": 0.1415, "num_tokens": 916968.0, "reward": 0.07750000059604645, "reward_std": 0.2801649868488312, "rewards/reward_func/mean": 0.07750000059604645, "rewards/reward_func/std": 0.36939141154289246, "sampling/importance_sampling_ratio/max": 2.532761573791504, "sampling/importance_sampling_ratio/mean": 1.3982677459716797, "sampling/importance_sampling_ratio/min": 0.7517146468162537, "sampling/sampling_logp_difference/max": 0.45490455627441406, "sampling/sampling_logp_difference/mean": 0.020801950246095657, "step": 164, "step_time": 82.38432875502622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.31968969106674194, "epoch": 0.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.8617737293243408, "kl": 0.013258876278996468, "learning_rate": 4.768234633941716e-06, "loss": -0.1274, "num_tokens": 923326.0, "reward": 0.3487499952316284, "reward_std": 0.5436524152755737, "rewards/reward_func/mean": 0.3487499952316284, "rewards/reward_func/std": 0.5261297821998596, "sampling/importance_sampling_ratio/max": 1.1405854225158691, "sampling/importance_sampling_ratio/mean": 0.9033545255661011, "sampling/importance_sampling_ratio/min": 0.7151353359222412, "sampling/sampling_logp_difference/max": 0.31670236587524414, "sampling/sampling_logp_difference/mean": 0.020617477595806122, "step": 165, "step_time": 91.76991003201692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.34689170122146606, "epoch": 0.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.9533897638320923, "kl": 0.005911373533308506, "learning_rate": 4.764818024996117e-06, "loss": 0.038, "num_tokens": 929389.0, "reward": 0.07500000298023224, "reward_std": 0.27509522438049316, "rewards/reward_func/mean": 0.07500000298023224, "rewards/reward_func/std": 0.3702123165130615, "sampling/importance_sampling_ratio/max": 1.1052998304367065, "sampling/importance_sampling_ratio/mean": 0.8497065305709839, "sampling/importance_sampling_ratio/min": 0.5713857412338257, "sampling/sampling_logp_difference/max": 0.2978546619415283, "sampling/sampling_logp_difference/mean": 0.019739100709557533, "step": 166, "step_time": 84.64203599700704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3531273603439331, "epoch": 0.334, "frac_reward_zero_std": 0.0, "grad_norm": 1.5965367555618286, "kl": 0.017820192500948906, "learning_rate": 4.76137765918113e-06, "loss": -0.1656, "num_tokens": 934556.0, "reward": 0.20249998569488525, "reward_std": 0.5291311740875244, "rewards/reward_func/mean": 0.20249998569488525, "rewards/reward_func/std": 0.4898906648159027, "sampling/importance_sampling_ratio/max": 1.6110693216323853, "sampling/importance_sampling_ratio/mean": 1.1585469245910645, "sampling/importance_sampling_ratio/min": 0.7089804410934448, "sampling/sampling_logp_difference/max": 0.5340450406074524, "sampling/sampling_logp_difference/mean": 0.02586180344223976, "step": 167, "step_time": 66.61828825098928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.31788909435272217, "epoch": 0.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.9643736481666565, "kl": 0.008403636515140533, "learning_rate": 4.757913572584564e-06, "loss": 0.017, "num_tokens": 939873.0, "reward": 0.20375001430511475, "reward_std": 0.5219398736953735, "rewards/reward_func/mean": 0.20375001430511475, "rewards/reward_func/std": 0.4842354357242584, "sampling/importance_sampling_ratio/max": 1.196733832359314, "sampling/importance_sampling_ratio/mean": 0.8918753266334534, "sampling/importance_sampling_ratio/min": 0.45229190587997437, "sampling/sampling_logp_difference/max": 0.3446998596191406, "sampling/sampling_logp_difference/mean": 0.01955568790435791, "step": 168, "step_time": 73.99562716198852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.34987837076187134, "epoch": 0.338, "frac_reward_zero_std": 0.0, "grad_norm": 1.0800118446350098, "kl": 0.006376350298523903, "learning_rate": 4.754425801543047e-06, "loss": 0.0112, "num_tokens": 945824.0, "reward": 0.2175000011920929, "reward_std": 0.3063386380672455, "rewards/reward_func/mean": 0.2175000011920929, "rewards/reward_func/std": 0.477426141500473, "sampling/importance_sampling_ratio/max": 1.8290735483169556, "sampling/importance_sampling_ratio/mean": 1.1064127683639526, "sampling/importance_sampling_ratio/min": 0.43567004799842834, "sampling/sampling_logp_difference/max": 0.31137561798095703, "sampling/sampling_logp_difference/mean": 0.020785929635167122, "step": 169, "step_time": 93.93666018798831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.35052281618118286, "epoch": 0.34, "frac_reward_zero_std": 0.0, "grad_norm": 1.340181589126587, "kl": 0.0070198904722929, "learning_rate": 4.750914382641647e-06, "loss": -0.1266, "num_tokens": 951240.0, "reward": 0.3087499737739563, "reward_std": 0.2839585244655609, "rewards/reward_func/mean": 0.3087499737739563, "rewards/reward_func/std": 0.5617685317993164, "sampling/importance_sampling_ratio/max": 1.5249656438827515, "sampling/importance_sampling_ratio/mean": 0.9443210363388062, "sampling/importance_sampling_ratio/min": 0.6084120869636536, "sampling/sampling_logp_difference/max": 0.3037455081939697, "sampling/sampling_logp_difference/mean": 0.020245909690856934, "step": 170, "step_time": 85.42551145199104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 42.375, "completions/mean_terminated_length": 42.375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.3007114827632904, "epoch": 0.342, "frac_reward_zero_std": 0.0, "grad_norm": 1.134536623954773, "kl": 0.011100707575678825, "learning_rate": 4.747379352713489e-06, "loss": -0.001, "num_tokens": 956957.0, "reward": 0.33124998211860657, "reward_std": 0.2721617817878723, "rewards/reward_func/mean": 0.33124998211860657, "rewards/reward_func/std": 0.5298096537590027, "sampling/importance_sampling_ratio/max": 1.7444802522659302, "sampling/importance_sampling_ratio/mean": 1.0147829055786133, "sampling/importance_sampling_ratio/min": 0.4858468472957611, "sampling/sampling_logp_difference/max": 0.3548402786254883, "sampling/sampling_logp_difference/mean": 0.024134717881679535, "step": 171, "step_time": 83.73731894500088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.36536312103271484, "epoch": 0.344, "frac_reward_zero_std": 0.0, "grad_norm": 2.420339345932007, "kl": 0.015561670064926147, "learning_rate": 4.743820748839362e-06, "loss": -0.1682, "num_tokens": 962384.0, "reward": 0.23499999940395355, "reward_std": 0.30095145106315613, "rewards/reward_func/mean": 0.23499999940395355, "rewards/reward_func/std": 0.46632298827171326, "sampling/importance_sampling_ratio/max": 2.9884486198425293, "sampling/importance_sampling_ratio/mean": 1.253305435180664, "sampling/importance_sampling_ratio/min": 0.40475034713745117, "sampling/sampling_logp_difference/max": 0.4607217311859131, "sampling/sampling_logp_difference/mean": 0.029603634029626846, "step": 172, "step_time": 79.53238872098154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.33054256439208984, "epoch": 0.346, "frac_reward_zero_std": 0.0, "grad_norm": 1.2542983293533325, "kl": 0.012076465412974358, "learning_rate": 4.740238608347337e-06, "loss": -0.0235, "num_tokens": 968102.0, "reward": 0.4775000214576721, "reward_std": 0.5982934236526489, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5544044971466064, "sampling/importance_sampling_ratio/max": 1.2516276836395264, "sampling/importance_sampling_ratio/mean": 1.0209238529205322, "sampling/importance_sampling_ratio/min": 0.8097511529922485, "sampling/sampling_logp_difference/max": 0.3150825500488281, "sampling/sampling_logp_difference/mean": 0.022141385823488235, "step": 173, "step_time": 67.22070981800789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3209684491157532, "epoch": 0.348, "frac_reward_zero_std": 0.0, "grad_norm": 1.5610387325286865, "kl": 0.007439862936735153, "learning_rate": 4.736632968812374e-06, "loss": -0.0656, "num_tokens": 973329.0, "reward": 0.4699999988079071, "reward_std": 0.612058162689209, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.5669718980789185, "sampling/importance_sampling_ratio/max": 2.20025897026062, "sampling/importance_sampling_ratio/mean": 1.2844336032867432, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6814525127410889, "sampling/sampling_logp_difference/mean": 0.029089387506246567, "step": 174, "step_time": 59.10385779500939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 46.375, "completions/mean_terminated_length": 46.375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3398328423500061, "epoch": 0.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.8624789714813232, "kl": 0.011860033497214317, "learning_rate": 4.733003868055923e-06, "loss": 0.1904, "num_tokens": 979417.0, "reward": 0.05624999478459358, "reward_std": 0.265840083360672, "rewards/reward_func/mean": 0.05624999478459358, "rewards/reward_func/std": 0.3444223999977112, "sampling/importance_sampling_ratio/max": 1.2150940895080566, "sampling/importance_sampling_ratio/mean": 0.98213791847229, "sampling/importance_sampling_ratio/min": 0.5763043165206909, "sampling/sampling_logp_difference/max": 0.3341519832611084, "sampling/sampling_logp_difference/mean": 0.020348751917481422, "step": 175, "step_time": 88.84378124101204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3720015287399292, "epoch": 0.352, "frac_reward_zero_std": 0.0, "grad_norm": 0.8829252123832703, "kl": 0.013934159651398659, "learning_rate": 4.729351344145536e-06, "loss": -0.0327, "num_tokens": 984863.0, "reward": 0.05249999836087227, "reward_std": 0.30060574412345886, "rewards/reward_func/mean": 0.05249999836087227, "rewards/reward_func/std": 0.38100433349609375, "sampling/importance_sampling_ratio/max": 1.3568812608718872, "sampling/importance_sampling_ratio/mean": 0.8758584260940552, "sampling/importance_sampling_ratio/min": 0.5294094681739807, "sampling/sampling_logp_difference/max": 0.36570286750793457, "sampling/sampling_logp_difference/mean": 0.023136310279369354, "step": 176, "step_time": 95.73476834298344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.36852994561195374, "epoch": 0.354, "frac_reward_zero_std": 0.0, "grad_norm": 1.2951862812042236, "kl": 0.021301502361893654, "learning_rate": 4.725675435394461e-06, "loss": 0.164, "num_tokens": 990337.0, "reward": 0.06875000894069672, "reward_std": 0.2854534685611725, "rewards/reward_func/mean": 0.06875000894069672, "rewards/reward_func/std": 0.37745150923728943, "sampling/importance_sampling_ratio/max": 2.195624589920044, "sampling/importance_sampling_ratio/mean": 0.981530487537384, "sampling/importance_sampling_ratio/min": 0.4619598090648651, "sampling/sampling_logp_difference/max": 0.628758430480957, "sampling/sampling_logp_difference/mean": 0.025460662320256233, "step": 177, "step_time": 67.39933587997803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3247171640396118, "epoch": 0.356, "frac_reward_zero_std": 0.0, "grad_norm": 1.6371877193450928, "kl": 0.012556849978864193, "learning_rate": 4.721976180361239e-06, "loss": 0.075, "num_tokens": 995402.0, "reward": 0.17749999463558197, "reward_std": 0.3520262837409973, "rewards/reward_func/mean": 0.17749999463558197, "rewards/reward_func/std": 0.5018181204795837, "sampling/importance_sampling_ratio/max": 1.499563455581665, "sampling/importance_sampling_ratio/mean": 0.9744052290916443, "sampling/importance_sampling_ratio/min": 0.5791205763816833, "sampling/sampling_logp_difference/max": 0.4286665916442871, "sampling/sampling_logp_difference/mean": 0.023523185402154922, "step": 178, "step_time": 59.054495546006365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3754112720489502, "epoch": 0.358, "frac_reward_zero_std": 0.0, "grad_norm": 1.0154576301574707, "kl": 0.013785503804683685, "learning_rate": 4.718253617849306e-06, "loss": 0.0381, "num_tokens": 1001387.0, "reward": 0.08749999105930328, "reward_std": 0.2781248092651367, "rewards/reward_func/mean": 0.08749999105930328, "rewards/reward_func/std": 0.3700868785381317, "sampling/importance_sampling_ratio/max": 1.0287582874298096, "sampling/importance_sampling_ratio/mean": 0.8582373857498169, "sampling/importance_sampling_ratio/min": 0.6108002066612244, "sampling/sampling_logp_difference/max": 0.3374152183532715, "sampling/sampling_logp_difference/mean": 0.023500245064496994, "step": 179, "step_time": 74.29987861201516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3148415982723236, "epoch": 0.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.8665428757667542, "kl": 0.014602867886424065, "learning_rate": 4.7145077869065815e-06, "loss": 0.2052, "num_tokens": 1006871.0, "reward": 0.20875000953674316, "reward_std": 0.5284746885299683, "rewards/reward_func/mean": 0.20875000953674316, "rewards/reward_func/std": 0.48932716250419617, "sampling/importance_sampling_ratio/max": 1.6064436435699463, "sampling/importance_sampling_ratio/mean": 0.8494887948036194, "sampling/importance_sampling_ratio/min": 0.28991734981536865, "sampling/sampling_logp_difference/max": 0.5766005516052246, "sampling/sampling_logp_difference/mean": 0.022570453584194183, "step": 180, "step_time": 75.10994843998924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 46.625, "completions/mean_terminated_length": 46.625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.35348600149154663, "epoch": 0.362, "frac_reward_zero_std": 0.0, "grad_norm": 1.372689962387085, "kl": 0.009884382598102093, "learning_rate": 4.710738726825059e-06, "loss": 0.1381, "num_tokens": 1012819.0, "reward": 0.20874999463558197, "reward_std": 0.5279327034950256, "rewards/reward_func/mean": 0.20874999463558197, "rewards/reward_func/std": 0.48888903856277466, "sampling/importance_sampling_ratio/max": 1.244268774986267, "sampling/importance_sampling_ratio/mean": 0.9364046454429626, "sampling/importance_sampling_ratio/min": 0.6309903264045715, "sampling/sampling_logp_difference/max": 0.3161327838897705, "sampling/sampling_logp_difference/mean": 0.02107790857553482, "step": 181, "step_time": 80.6431828700006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.3191605806350708, "epoch": 0.364, "frac_reward_zero_std": 0.0, "grad_norm": 1.2262747287750244, "kl": 0.011803516186773777, "learning_rate": 4.706946477140396e-06, "loss": 0.0117, "num_tokens": 1017886.0, "reward": 0.08250000327825546, "reward_std": 0.2791511118412018, "rewards/reward_func/mean": 0.08250000327825546, "rewards/reward_func/std": 0.37247246503829956, "sampling/importance_sampling_ratio/max": 1.2804160118103027, "sampling/importance_sampling_ratio/mean": 0.7288067936897278, "sampling/importance_sampling_ratio/min": 0.4809218645095825, "sampling/sampling_logp_difference/max": 0.35615110397338867, "sampling/sampling_logp_difference/mean": 0.023954380303621292, "step": 182, "step_time": 66.17639043199597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.375, "completions/mean_terminated_length": 41.375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.31017425656318665, "epoch": 0.366, "frac_reward_zero_std": 0.0, "grad_norm": 2.0836074352264404, "kl": 0.014769114553928375, "learning_rate": 4.703131077631498e-06, "loss": 0.1999, "num_tokens": 1023314.0, "reward": 0.3387500047683716, "reward_std": 0.2747931182384491, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.5456958413124084, "sampling/importance_sampling_ratio/max": 2.228360414505005, "sampling/importance_sampling_ratio/mean": 1.1713675260543823, "sampling/importance_sampling_ratio/min": 0.5424574017524719, "sampling/sampling_logp_difference/max": 0.5427889823913574, "sampling/sampling_logp_difference/mean": 0.02546188049018383, "step": 183, "step_time": 82.83274039000389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 44.625, "completions/mean_terminated_length": 44.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.35528260469436646, "epoch": 0.368, "frac_reward_zero_std": 0.0, "grad_norm": 1.2007426023483276, "kl": 0.006659870967268944, "learning_rate": 4.699292568320097e-06, "loss": -0.0313, "num_tokens": 1028524.0, "reward": 0.32625001668930054, "reward_std": 0.5633938312530518, "rewards/reward_func/mean": 0.32625001668930054, "rewards/reward_func/std": 0.5353753566741943, "sampling/importance_sampling_ratio/max": 1.7591568231582642, "sampling/importance_sampling_ratio/mean": 1.0408813953399658, "sampling/importance_sampling_ratio/min": 0.6595721244812012, "sampling/sampling_logp_difference/max": 0.7866129875183105, "sampling/sampling_logp_difference/mean": 0.021467799320816994, "step": 184, "step_time": 79.17429194701253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.39394837617874146, "epoch": 0.37, "frac_reward_zero_std": 0.0, "grad_norm": 1.294677734375, "kl": 0.015325892716646194, "learning_rate": 4.6954309894703435e-06, "loss": -0.0185, "num_tokens": 1033728.0, "reward": 0.14000000059604645, "reward_std": 0.5303218364715576, "rewards/reward_func/mean": 0.14000000059604645, "rewards/reward_func/std": 0.4927183985710144, "sampling/importance_sampling_ratio/max": 1.6721136569976807, "sampling/importance_sampling_ratio/mean": 0.8524694442749023, "sampling/importance_sampling_ratio/min": 0.3965020775794983, "sampling/sampling_logp_difference/max": 0.49992823600769043, "sampling/sampling_logp_difference/mean": 0.025052586570382118, "step": 185, "step_time": 88.38119602698134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 46.125, "completions/mean_terminated_length": 46.125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.27076610922813416, "epoch": 0.372, "frac_reward_zero_std": 0.0, "grad_norm": 0.7855340838432312, "kl": 0.01319638080894947, "learning_rate": 4.69154638158837e-06, "loss": 0.0525, "num_tokens": 1039269.0, "reward": 0.3199999928474426, "reward_std": 0.5744900107383728, "rewards/reward_func/mean": 0.3199999928474426, "rewards/reward_func/std": 0.5606883764266968, "sampling/importance_sampling_ratio/max": 1.3193827867507935, "sampling/importance_sampling_ratio/mean": 0.7310043573379517, "sampling/importance_sampling_ratio/min": 0.36375343799591064, "sampling/sampling_logp_difference/max": 0.707329273223877, "sampling/sampling_logp_difference/mean": 0.02303919941186905, "step": 186, "step_time": 46.03551520599285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3389705419540405, "epoch": 0.374, "frac_reward_zero_std": 0.0, "grad_norm": 1.0259521007537842, "kl": 0.0074524343945086, "learning_rate": 4.687638785421875e-06, "loss": 0.0077, "num_tokens": 1046492.0, "reward": 0.1899999976158142, "reward_std": 0.31486421823501587, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.476385235786438, "sampling/importance_sampling_ratio/max": 1.1441234350204468, "sampling/importance_sampling_ratio/mean": 0.8540750741958618, "sampling/importance_sampling_ratio/min": 0.4722847044467926, "sampling/sampling_logp_difference/max": 0.43096935749053955, "sampling/sampling_logp_difference/mean": 0.02116192877292633, "step": 187, "step_time": 108.60295571299503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3361474871635437, "epoch": 0.376, "frac_reward_zero_std": 0.0, "grad_norm": 1.7023673057556152, "kl": 0.01211222168058157, "learning_rate": 4.683708241959694e-06, "loss": -0.4484, "num_tokens": 1052225.0, "reward": 0.20374999940395355, "reward_std": 0.5282833576202393, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.4892541766166687, "sampling/importance_sampling_ratio/max": 1.8190691471099854, "sampling/importance_sampling_ratio/mean": 1.0389931201934814, "sampling/importance_sampling_ratio/min": 0.39706769585609436, "sampling/sampling_logp_difference/max": 0.3256983757019043, "sampling/sampling_logp_difference/mean": 0.022885797545313835, "step": 188, "step_time": 71.87692314898595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3790978193283081, "epoch": 0.378, "frac_reward_zero_std": 0.0, "grad_norm": 1.685671091079712, "kl": 0.02672746405005455, "learning_rate": 4.679754792431368e-06, "loss": -0.3355, "num_tokens": 1057327.0, "reward": 0.3125, "reward_std": 0.592415452003479, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.5662848949432373, "sampling/importance_sampling_ratio/max": 2.115366220474243, "sampling/importance_sampling_ratio/mean": 1.1775258779525757, "sampling/importance_sampling_ratio/min": 0.5436846017837524, "sampling/sampling_logp_difference/max": 0.46004533767700195, "sampling/sampling_logp_difference/mean": 0.02344960719347, "step": 189, "step_time": 89.23043878999306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.34846365451812744, "epoch": 0.38, "frac_reward_zero_std": 0.0, "grad_norm": 1.1112720966339111, "kl": 0.01807713694870472, "learning_rate": 4.675778478306712e-06, "loss": 0.1345, "num_tokens": 1062997.0, "reward": 0.21125000715255737, "reward_std": 0.5194716453552246, "rewards/reward_func/mean": 0.21125000715255737, "rewards/reward_func/std": 0.4811723232269287, "sampling/importance_sampling_ratio/max": 1.4994481801986694, "sampling/importance_sampling_ratio/mean": 1.0004935264587402, "sampling/importance_sampling_ratio/min": 0.4650833010673523, "sampling/sampling_logp_difference/max": 0.5518231391906738, "sampling/sampling_logp_difference/mean": 0.02722608856856823, "step": 190, "step_time": 80.97714860000997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3735997676849365, "epoch": 0.382, "frac_reward_zero_std": 0.0, "grad_norm": 1.2785558700561523, "kl": 0.018139660358428955, "learning_rate": 4.671779341295378e-06, "loss": 0.1762, "num_tokens": 1067953.0, "reward": 0.20374999940395355, "reward_std": 0.5309150815010071, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.49260058999061584, "sampling/importance_sampling_ratio/max": 1.8519113063812256, "sampling/importance_sampling_ratio/mean": 1.313336730003357, "sampling/importance_sampling_ratio/min": 0.7735275626182556, "sampling/sampling_logp_difference/max": 0.33982229232788086, "sampling/sampling_logp_difference/mean": 0.028083689510822296, "step": 191, "step_time": 77.81035732399323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.3238619267940521, "epoch": 0.384, "frac_reward_zero_std": 0.0, "grad_norm": 1.0245407819747925, "kl": 0.014340454712510109, "learning_rate": 4.667757423346423e-06, "loss": 0.0233, "num_tokens": 1072876.0, "reward": 0.3050000071525574, "reward_std": 0.6045562028884888, "rewards/reward_func/mean": 0.3050000071525574, "rewards/reward_func/std": 0.574978232383728, "sampling/importance_sampling_ratio/max": 1.4169648885726929, "sampling/importance_sampling_ratio/mean": 0.9852313995361328, "sampling/importance_sampling_ratio/min": 0.6186890602111816, "sampling/sampling_logp_difference/max": 0.32411623001098633, "sampling/sampling_logp_difference/mean": 0.021142879500985146, "step": 192, "step_time": 65.84021737999865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.335887610912323, "epoch": 0.386, "frac_reward_zero_std": 0.0, "grad_norm": 1.4161148071289062, "kl": 0.013850709423422813, "learning_rate": 4.663712766647862e-06, "loss": -0.0187, "num_tokens": 1079270.0, "reward": 0.17625001072883606, "reward_std": 0.34232813119888306, "rewards/reward_func/mean": 0.17625001072883606, "rewards/reward_func/std": 0.49956947565078735, "sampling/importance_sampling_ratio/max": 1.827757716178894, "sampling/importance_sampling_ratio/mean": 1.0969743728637695, "sampling/importance_sampling_ratio/min": 0.5536801815032959, "sampling/sampling_logp_difference/max": 0.36859893798828125, "sampling/sampling_logp_difference/mean": 0.023405691608786583, "step": 193, "step_time": 109.58445479700458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.31033533811569214, "epoch": 0.388, "frac_reward_zero_std": 0.0, "grad_norm": 1.027238368988037, "kl": 0.016297120600938797, "learning_rate": 4.65964541362623e-06, "loss": -0.0868, "num_tokens": 1084716.0, "reward": 0.3387500047683716, "reward_std": 0.5553901791572571, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.5315056443214417, "sampling/importance_sampling_ratio/max": 1.3392726182937622, "sampling/importance_sampling_ratio/mean": 1.071367859840393, "sampling/importance_sampling_ratio/min": 0.7315554022789001, "sampling/sampling_logp_difference/max": 0.49803805351257324, "sampling/sampling_logp_difference/mean": 0.02089758589863777, "step": 194, "step_time": 67.651659035997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 44.125, "completions/mean_terminated_length": 44.125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.3812289834022522, "epoch": 0.39, "frac_reward_zero_std": 0.0, "grad_norm": 1.7000164985656738, "kl": 0.01980014517903328, "learning_rate": 4.655555406946135e-06, "loss": -0.1177, "num_tokens": 1089906.0, "reward": 0.32500001788139343, "reward_std": 0.5569195747375488, "rewards/reward_func/mean": 0.32500001788139343, "rewards/reward_func/std": 0.5433756709098816, "sampling/importance_sampling_ratio/max": 1.6265194416046143, "sampling/importance_sampling_ratio/mean": 1.0881175994873047, "sampling/importance_sampling_ratio/min": 0.6452130675315857, "sampling/sampling_logp_difference/max": 0.3572232723236084, "sampling/sampling_logp_difference/mean": 0.023923953995108604, "step": 195, "step_time": 64.9109322100121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.37043747305870056, "epoch": 0.392, "frac_reward_zero_std": 0.0, "grad_norm": 2.687000036239624, "kl": 0.022207628935575485, "learning_rate": 4.651442789509813e-06, "loss": 0.7241, "num_tokens": 1095253.0, "reward": -0.03999999910593033, "reward_std": 0.04871772229671478, "rewards/reward_func/mean": -0.03999999910593033, "rewards/reward_func/std": 0.05182388052344322, "sampling/importance_sampling_ratio/max": 2.730234146118164, "sampling/importance_sampling_ratio/mean": 1.2118090391159058, "sampling/importance_sampling_ratio/min": 0.32341212034225464, "sampling/sampling_logp_difference/max": 0.3899533748626709, "sampling/sampling_logp_difference/mean": 0.027441177517175674, "step": 196, "step_time": 88.97580981699866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.4157707095146179, "epoch": 0.394, "frac_reward_zero_std": 0.0, "grad_norm": 1.3999090194702148, "kl": 0.017077336087822914, "learning_rate": 4.647307604456675e-06, "loss": 0.1207, "num_tokens": 1101561.0, "reward": 0.07624999433755875, "reward_std": 0.2700246274471283, "rewards/reward_func/mean": 0.07624999433755875, "rewards/reward_func/std": 0.35860592126846313, "sampling/importance_sampling_ratio/max": 1.3688700199127197, "sampling/importance_sampling_ratio/mean": 0.9039748907089233, "sampling/importance_sampling_ratio/min": 0.5610687732696533, "sampling/sampling_logp_difference/max": 0.30984562635421753, "sampling/sampling_logp_difference/mean": 0.026080135256052017, "step": 197, "step_time": 79.29961144700064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.35132038593292236, "epoch": 0.396, "frac_reward_zero_std": 0.0, "grad_norm": 1.0895273685455322, "kl": 0.014003671705722809, "learning_rate": 4.643149895162854e-06, "loss": -0.0698, "num_tokens": 1106835.0, "reward": 0.1850000023841858, "reward_std": 0.3318837285041809, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.4803272783756256, "sampling/importance_sampling_ratio/max": 1.1942050457000732, "sampling/importance_sampling_ratio/mean": 0.8434613943099976, "sampling/importance_sampling_ratio/min": 0.36741903424263, "sampling/sampling_logp_difference/max": 0.5363888740539551, "sampling/sampling_logp_difference/mean": 0.026127520948648453, "step": 198, "step_time": 85.95462637199671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.34718334674835205, "epoch": 0.398, "frac_reward_zero_std": 0.0, "grad_norm": 1.287834882736206, "kl": 0.014416320249438286, "learning_rate": 4.6389697052407535e-06, "loss": -0.0184, "num_tokens": 1112538.0, "reward": 0.07625000178813934, "reward_std": 0.27981066703796387, "rewards/reward_func/mean": 0.07625000178813934, "rewards/reward_func/std": 0.3634335398674011, "sampling/importance_sampling_ratio/max": 1.8455092906951904, "sampling/importance_sampling_ratio/mean": 1.0616990327835083, "sampling/importance_sampling_ratio/min": 0.7087575197219849, "sampling/sampling_logp_difference/max": 0.43187177181243896, "sampling/sampling_logp_difference/mean": 0.02503993548452854, "step": 199, "step_time": 70.96149959298782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3079107701778412, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 1.0146771669387817, "kl": 0.013202982023358345, "learning_rate": 4.634767078538589e-06, "loss": -0.1111, "num_tokens": 1118132.0, "reward": 0.32249999046325684, "reward_std": 0.550460934638977, "rewards/reward_func/mean": 0.32249999046325684, "rewards/reward_func/std": 0.5314871072769165, "sampling/importance_sampling_ratio/max": 1.090162992477417, "sampling/importance_sampling_ratio/mean": 0.7846779823303223, "sampling/importance_sampling_ratio/min": 0.5243642330169678, "sampling/sampling_logp_difference/max": 0.552169919013977, "sampling/sampling_logp_difference/mean": 0.019852038472890854, "step": 200, "step_time": 77.93985611898825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.38525718450546265, "epoch": 0.402, "frac_reward_zero_std": 0.0, "grad_norm": 1.3107483386993408, "kl": 0.020291147753596306, "learning_rate": 4.630542059139923e-06, "loss": 0.0876, "num_tokens": 1123554.0, "reward": 0.44999998807907104, "reward_std": 0.5826581716537476, "rewards/reward_func/mean": 0.44999998807907104, "rewards/reward_func/std": 0.5397353768348694, "sampling/importance_sampling_ratio/max": 1.5795011520385742, "sampling/importance_sampling_ratio/mean": 0.9796627163887024, "sampling/importance_sampling_ratio/min": 0.26825037598609924, "sampling/sampling_logp_difference/max": 0.42403650283813477, "sampling/sampling_logp_difference/mean": 0.02384255826473236, "step": 201, "step_time": 64.71107027700054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3214360177516937, "epoch": 0.404, "frac_reward_zero_std": 0.0, "grad_norm": 0.9233896732330322, "kl": 0.01243473682552576, "learning_rate": 4.626294691363213e-06, "loss": -0.006, "num_tokens": 1129228.0, "reward": 0.19374999403953552, "reward_std": 0.5165963172912598, "rewards/reward_func/mean": 0.19374999403953552, "rewards/reward_func/std": 0.4797302186489105, "sampling/importance_sampling_ratio/max": 1.7446887493133545, "sampling/importance_sampling_ratio/mean": 1.0539482831954956, "sampling/importance_sampling_ratio/min": 0.6484421491622925, "sampling/sampling_logp_difference/max": 0.3515496253967285, "sampling/sampling_logp_difference/mean": 0.02195613458752632, "step": 202, "step_time": 65.19753580997349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3714994192123413, "epoch": 0.406, "frac_reward_zero_std": 0.0, "grad_norm": 0.8211506009101868, "kl": 0.010113951750099659, "learning_rate": 4.622025019761336e-06, "loss": 0.0358, "num_tokens": 1134606.0, "reward": 0.19249999523162842, "reward_std": 0.5170982480049133, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.47927772998809814, "sampling/importance_sampling_ratio/max": 2.0278983116149902, "sampling/importance_sampling_ratio/mean": 0.9206888675689697, "sampling/importance_sampling_ratio/min": 0.5617751479148865, "sampling/sampling_logp_difference/max": 0.347994327545166, "sampling/sampling_logp_difference/mean": 0.02045644447207451, "step": 203, "step_time": 71.5089026770147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.3702104687690735, "epoch": 0.408, "frac_reward_zero_std": 0.0, "grad_norm": 1.3650530576705933, "kl": 0.030135968700051308, "learning_rate": 4.617733089121127e-06, "loss": 0.2188, "num_tokens": 1139666.0, "reward": 0.07249999791383743, "reward_std": 0.29401281476020813, "rewards/reward_func/mean": 0.07249999791383743, "rewards/reward_func/std": 0.37803059816360474, "sampling/importance_sampling_ratio/max": 1.2655539512634277, "sampling/importance_sampling_ratio/mean": 0.7001688480377197, "sampling/importance_sampling_ratio/min": 0.36836326122283936, "sampling/sampling_logp_difference/max": 0.5306470394134521, "sampling/sampling_logp_difference/mean": 0.030222740024328232, "step": 204, "step_time": 76.82139863798511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3269600570201874, "epoch": 0.41, "frac_reward_zero_std": 0.0, "grad_norm": 1.2307133674621582, "kl": 0.010514447465538979, "learning_rate": 4.613418944462907e-06, "loss": 0.0782, "num_tokens": 1145168.0, "reward": 0.33249998092651367, "reward_std": 0.5493869781494141, "rewards/reward_func/mean": 0.33249998092651367, "rewards/reward_func/std": 0.5238797664642334, "sampling/importance_sampling_ratio/max": 1.906840443611145, "sampling/importance_sampling_ratio/mean": 1.0142680406570435, "sampling/importance_sampling_ratio/min": 0.42224809527397156, "sampling/sampling_logp_difference/max": 0.7108626365661621, "sampling/sampling_logp_difference/mean": 0.02854611724615097, "step": 205, "step_time": 78.07254538699635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.39792150259017944, "epoch": 0.412, "frac_reward_zero_std": 0.0, "grad_norm": 1.3023905754089355, "kl": 0.02957381308078766, "learning_rate": 4.609082631040012e-06, "loss": -0.0722, "num_tokens": 1150370.0, "reward": 0.3500000238418579, "reward_std": 0.5520753264427185, "rewards/reward_func/mean": 0.3500000238418579, "rewards/reward_func/std": 0.5316282510757446, "sampling/importance_sampling_ratio/max": 1.5235435962677002, "sampling/importance_sampling_ratio/mean": 0.9376378655433655, "sampling/importance_sampling_ratio/min": 0.33686015009880066, "sampling/sampling_logp_difference/max": 0.342923641204834, "sampling/sampling_logp_difference/mean": 0.027645057067275047, "step": 206, "step_time": 63.247660350985825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.35857129096984863, "epoch": 0.414, "frac_reward_zero_std": 0.0, "grad_norm": 1.0902588367462158, "kl": 0.009508270770311356, "learning_rate": 4.604724194338318e-06, "loss": 0.0542, "num_tokens": 1155624.0, "reward": 0.48500001430511475, "reward_std": 0.5167855620384216, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5455272793769836, "sampling/importance_sampling_ratio/max": 2.0466551780700684, "sampling/importance_sampling_ratio/mean": 1.1512298583984375, "sampling/importance_sampling_ratio/min": 0.4752918779850006, "sampling/sampling_logp_difference/max": 0.3554987907409668, "sampling/sampling_logp_difference/mean": 0.02029740810394287, "step": 207, "step_time": 50.03297023801133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3821730315685272, "epoch": 0.416, "frac_reward_zero_std": 0.0, "grad_norm": 1.3871904611587524, "kl": 0.01627778261899948, "learning_rate": 4.600343680075764e-06, "loss": -0.1631, "num_tokens": 1161217.0, "reward": 0.5975000262260437, "reward_std": 0.5603698492050171, "rewards/reward_func/mean": 0.5975000262260437, "rewards/reward_func/std": 0.5395169854164124, "sampling/importance_sampling_ratio/max": 2.740676164627075, "sampling/importance_sampling_ratio/mean": 1.3194831609725952, "sampling/importance_sampling_ratio/min": 0.6237443685531616, "sampling/sampling_logp_difference/max": 0.33385396003723145, "sampling/sampling_logp_difference/mean": 0.023148780688643456, "step": 208, "step_time": 48.81324854400009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.4071102738380432, "epoch": 0.418, "frac_reward_zero_std": 0.0, "grad_norm": 1.3003276586532593, "kl": 0.015321669168770313, "learning_rate": 4.5959411342018715e-06, "loss": 0.0384, "num_tokens": 1166266.0, "reward": 0.21250000596046448, "reward_std": 0.31011754274368286, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.48414137959480286, "sampling/importance_sampling_ratio/max": 1.357431173324585, "sampling/importance_sampling_ratio/mean": 0.9291549921035767, "sampling/importance_sampling_ratio/min": 0.40400320291519165, "sampling/sampling_logp_difference/max": 0.3345675468444824, "sampling/sampling_logp_difference/mean": 0.02914167195558548, "step": 209, "step_time": 58.75232109500212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 44.375, "completions/mean_terminated_length": 44.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3340144157409668, "epoch": 0.42, "frac_reward_zero_std": 0.0, "grad_norm": 1.2524479627609253, "kl": 0.009892448782920837, "learning_rate": 4.591516602897263e-06, "loss": 0.0476, "num_tokens": 1171977.0, "reward": 0.20249998569488525, "reward_std": 0.5303123593330383, "rewards/reward_func/mean": 0.20249998569488525, "rewards/reward_func/std": 0.491288423538208, "sampling/importance_sampling_ratio/max": 1.9688581228256226, "sampling/importance_sampling_ratio/mean": 0.9722031354904175, "sampling/importance_sampling_ratio/min": 0.4560692310333252, "sampling/sampling_logp_difference/max": 0.47432082891464233, "sampling/sampling_logp_difference/mean": 0.024021849036216736, "step": 210, "step_time": 66.03959386100178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.36171913146972656, "epoch": 0.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.9004639387130737, "kl": 0.01368255726993084, "learning_rate": 4.587070132573178e-06, "loss": -0.099, "num_tokens": 1178226.0, "reward": 0.3137499988079071, "reward_std": 0.5728945732116699, "rewards/reward_func/mean": 0.3137499988079071, "rewards/reward_func/std": 0.5473296046257019, "sampling/importance_sampling_ratio/max": 1.6000454425811768, "sampling/importance_sampling_ratio/mean": 0.8045486211776733, "sampling/importance_sampling_ratio/min": 0.18612989783287048, "sampling/sampling_logp_difference/max": 0.4911985397338867, "sampling/sampling_logp_difference/mean": 0.02342919073998928, "step": 211, "step_time": 82.29735374101438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.625, "completions/mean_terminated_length": 55.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3647511601448059, "epoch": 0.424, "frac_reward_zero_std": 0.0, "grad_norm": 1.4303269386291504, "kl": 0.014668257907032967, "learning_rate": 4.582601769870988e-06, "loss": -0.0589, "num_tokens": 1183454.0, "reward": 0.05624999478459358, "reward_std": 0.28012219071388245, "rewards/reward_func/mean": 0.05624999478459358, "rewards/reward_func/std": 0.36660364270210266, "sampling/importance_sampling_ratio/max": 1.5942217111587524, "sampling/importance_sampling_ratio/mean": 1.0475343465805054, "sampling/importance_sampling_ratio/min": 0.5613923072814941, "sampling/sampling_logp_difference/max": 0.30344557762145996, "sampling/sampling_logp_difference/mean": 0.02498428151011467, "step": 212, "step_time": 81.58139562400174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.32731741666793823, "epoch": 0.426, "frac_reward_zero_std": 0.0, "grad_norm": 1.9570350646972656, "kl": 0.01391584612429142, "learning_rate": 4.578111561661702e-06, "loss": -0.0079, "num_tokens": 1188684.0, "reward": 0.20625001192092896, "reward_std": 0.31164175271987915, "rewards/reward_func/mean": 0.20625001192092896, "rewards/reward_func/std": 0.4845598340034485, "sampling/importance_sampling_ratio/max": 2.221400022506714, "sampling/importance_sampling_ratio/mean": 1.2576444149017334, "sampling/importance_sampling_ratio/min": 0.4654132127761841, "sampling/sampling_logp_difference/max": 0.3340733051300049, "sampling/sampling_logp_difference/mean": 0.02158265747129917, "step": 213, "step_time": 62.86962395600858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 44.875, "completions/mean_terminated_length": 44.875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3720092177391052, "epoch": 0.428, "frac_reward_zero_std": 0.0, "grad_norm": 2.0819950103759766, "kl": 0.028701618313789368, "learning_rate": 4.57359955504548e-06, "loss": -0.2403, "num_tokens": 1194175.0, "reward": 0.059999994933605194, "reward_std": 0.2900388836860657, "rewards/reward_func/mean": 0.059999994933605194, "rewards/reward_func/std": 0.3795486092567444, "sampling/importance_sampling_ratio/max": 1.9808765649795532, "sampling/importance_sampling_ratio/mean": 0.9989551305770874, "sampling/importance_sampling_ratio/min": 0.3208101689815521, "sampling/sampling_logp_difference/max": 0.43723440170288086, "sampling/sampling_logp_difference/mean": 0.02771320939064026, "step": 214, "step_time": 80.43211394100217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 44.125, "completions/mean_terminated_length": 44.125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.3983091115951538, "epoch": 0.43, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234328508377075, "kl": 0.017359508201479912, "learning_rate": 4.569065797351135e-06, "loss": 0.03, "num_tokens": 1200200.0, "reward": 0.07500000298023224, "reward_std": 0.28411543369293213, "rewards/reward_func/mean": 0.07500000298023224, "rewards/reward_func/std": 0.3648875057697296, "sampling/importance_sampling_ratio/max": 1.542760968208313, "sampling/importance_sampling_ratio/mean": 0.9468032717704773, "sampling/importance_sampling_ratio/min": 0.331230491399765, "sampling/sampling_logp_difference/max": 0.34746503829956055, "sampling/sampling_logp_difference/mean": 0.023643018677830696, "step": 215, "step_time": 68.54731415698188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 45.625, "completions/mean_terminated_length": 45.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.32628491520881653, "epoch": 0.432, "frac_reward_zero_std": 0.0, "grad_norm": 1.16749107837677, "kl": 0.009836241602897644, "learning_rate": 4.564510336135642e-06, "loss": -0.1468, "num_tokens": 1205836.0, "reward": 0.3449999988079071, "reward_std": 0.5644152164459229, "rewards/reward_func/mean": 0.3449999988079071, "rewards/reward_func/std": 0.5431390404701233, "sampling/importance_sampling_ratio/max": 1.6908862590789795, "sampling/importance_sampling_ratio/mean": 1.001993179321289, "sampling/importance_sampling_ratio/min": 0.4759381413459778, "sampling/sampling_logp_difference/max": 0.45307183265686035, "sampling/sampling_logp_difference/mean": 0.023161139339208603, "step": 216, "step_time": 64.80334895499982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.40457984805107117, "epoch": 0.434, "frac_reward_zero_std": 0.0, "grad_norm": 2.506805181503296, "kl": 0.02346435934305191, "learning_rate": 4.559933219183631e-06, "loss": 0.1214, "num_tokens": 1211436.0, "reward": 0.07000000774860382, "reward_std": 0.28503167629241943, "rewards/reward_func/mean": 0.07000000774860382, "rewards/reward_func/std": 0.36613819003105164, "sampling/importance_sampling_ratio/max": 2.0334134101867676, "sampling/importance_sampling_ratio/mean": 1.1956446170806885, "sampling/importance_sampling_ratio/min": 0.4909520149230957, "sampling/sampling_logp_difference/max": 0.3521122932434082, "sampling/sampling_logp_difference/mean": 0.023371540009975433, "step": 217, "step_time": 77.5817707440001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3373414874076843, "epoch": 0.436, "frac_reward_zero_std": 0.0, "grad_norm": 1.612067461013794, "kl": 0.02040504291653633, "learning_rate": 4.555334494506895e-06, "loss": 0.0114, "num_tokens": 1216591.0, "reward": 0.3100000023841858, "reward_std": 0.2696232199668884, "rewards/reward_func/mean": 0.3100000023841858, "rewards/reward_func/std": 0.525221049785614, "sampling/importance_sampling_ratio/max": 2.0312304496765137, "sampling/importance_sampling_ratio/mean": 1.0194265842437744, "sampling/importance_sampling_ratio/min": 0.5188978910446167, "sampling/sampling_logp_difference/max": 0.5666763782501221, "sampling/sampling_logp_difference/mean": 0.023367371410131454, "step": 218, "step_time": 83.6308395829983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 45.125, "completions/mean_terminated_length": 45.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3169403672218323, "epoch": 0.438, "frac_reward_zero_std": 0.0, "grad_norm": 0.9353013038635254, "kl": 0.058528319001197815, "learning_rate": 4.550714210343879e-06, "loss": 0.0259, "num_tokens": 1222212.0, "reward": 0.45375001430511475, "reward_std": 0.5997226238250732, "rewards/reward_func/mean": 0.45375001430511475, "rewards/reward_func/std": 0.5565438866615295, "sampling/importance_sampling_ratio/max": 1.582319736480713, "sampling/importance_sampling_ratio/mean": 0.8256447315216064, "sampling/importance_sampling_ratio/min": 0.30875927209854126, "sampling/sampling_logp_difference/max": 0.9188776016235352, "sampling/sampling_logp_difference/mean": 0.023561663925647736, "step": 219, "step_time": 80.09332663999521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 44.625, "completions/mean_terminated_length": 44.625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.36092275381088257, "epoch": 0.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.6617423892021179, "kl": 0.00997140072286129, "learning_rate": 4.546072415159179e-06, "loss": 0.145, "num_tokens": 1227779.0, "reward": 0.32749998569488525, "reward_std": 0.578244149684906, "rewards/reward_func/mean": 0.32749998569488525, "rewards/reward_func/std": 0.5523391962051392, "sampling/importance_sampling_ratio/max": 1.2815821170806885, "sampling/importance_sampling_ratio/mean": 0.6444365978240967, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1129623651504517, "sampling/sampling_logp_difference/mean": 0.029664166271686554, "step": 220, "step_time": 67.70866433801712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 55.375, "completions/mean_terminated_length": 55.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.38162463903427124, "epoch": 0.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828568458557129, "kl": 0.009829282760620117, "learning_rate": 4.541409157643027e-06, "loss": -0.0023, "num_tokens": 1232951.0, "reward": 0.20249998569488525, "reward_std": 0.3496881127357483, "rewards/reward_func/mean": 0.20249998569488525, "rewards/reward_func/std": 0.48948225378990173, "sampling/importance_sampling_ratio/max": 1.5951018333435059, "sampling/importance_sampling_ratio/mean": 1.0153491497039795, "sampling/importance_sampling_ratio/min": 0.5687949657440186, "sampling/sampling_logp_difference/max": 0.5899345278739929, "sampling/sampling_logp_difference/mean": 0.025079842656850815, "step": 221, "step_time": 59.01676659900113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.34739094972610474, "epoch": 0.444, "frac_reward_zero_std": 0.0, "grad_norm": 0.8717793226242065, "kl": 0.013192622922360897, "learning_rate": 4.5367244867107905e-06, "loss": 0.1667, "num_tokens": 1238183.0, "reward": 0.0612499974668026, "reward_std": 0.28369978070259094, "rewards/reward_func/mean": 0.0612499974668026, "rewards/reward_func/std": 0.37635233998298645, "sampling/importance_sampling_ratio/max": 1.736910343170166, "sampling/importance_sampling_ratio/mean": 0.845312237739563, "sampling/importance_sampling_ratio/min": 0.32411935925483704, "sampling/sampling_logp_difference/max": 0.4334859848022461, "sampling/sampling_logp_difference/mean": 0.02428375370800495, "step": 222, "step_time": 75.28331548400456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3791123628616333, "epoch": 0.446, "frac_reward_zero_std": 0.0, "grad_norm": 1.7146968841552734, "kl": 0.009486062452197075, "learning_rate": 4.53201845150245e-06, "loss": -0.0391, "num_tokens": 1244209.0, "reward": 0.1850000023841858, "reward_std": 0.4920302927494049, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.45610150694847107, "sampling/importance_sampling_ratio/max": 1.846500039100647, "sampling/importance_sampling_ratio/mean": 1.1525156497955322, "sampling/importance_sampling_ratio/min": 0.5845286250114441, "sampling/sampling_logp_difference/max": 0.3337571620941162, "sampling/sampling_logp_difference/mean": 0.023619763553142548, "step": 223, "step_time": 73.09225799201522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3343263268470764, "epoch": 0.448, "frac_reward_zero_std": 0.0, "grad_norm": 1.0479278564453125, "kl": 0.015914462506771088, "learning_rate": 4.527291101382088e-06, "loss": -0.0026, "num_tokens": 1249545.0, "reward": 0.45125001668930054, "reward_std": 0.6346049904823303, "rewards/reward_func/mean": 0.45125001668930054, "rewards/reward_func/std": 0.5894412994384766, "sampling/importance_sampling_ratio/max": 1.4512196779251099, "sampling/importance_sampling_ratio/mean": 0.9548776149749756, "sampling/importance_sampling_ratio/min": 0.34800758957862854, "sampling/sampling_logp_difference/max": 0.516020655632019, "sampling/sampling_logp_difference/mean": 0.019098889082670212, "step": 224, "step_time": 61.251870251988294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.35256800055503845, "epoch": 0.45, "frac_reward_zero_std": 0.0, "grad_norm": 1.4217804670333862, "kl": 0.021977337077260017, "learning_rate": 4.522542485937369e-06, "loss": 0.4031, "num_tokens": 1255140.0, "reward": -0.06624999642372131, "reward_std": 0.04499492794275284, "rewards/reward_func/mean": -0.06624999642372131, "rewards/reward_func/std": 0.07818248122930527, "sampling/importance_sampling_ratio/max": 1.9071402549743652, "sampling/importance_sampling_ratio/mean": 0.9328581094741821, "sampling/importance_sampling_ratio/min": 0.30502116680145264, "sampling/sampling_logp_difference/max": 0.640667200088501, "sampling/sampling_logp_difference/mean": 0.02563471347093582, "step": 225, "step_time": 75.17881314299302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.35993432998657227, "epoch": 0.452, "frac_reward_zero_std": 0.0, "grad_norm": 1.419751524925232, "kl": 0.011443949304521084, "learning_rate": 4.517772654979024e-06, "loss": -0.1155, "num_tokens": 1261099.0, "reward": 0.32375001907348633, "reward_std": 0.5461503267288208, "rewards/reward_func/mean": 0.32375001907348633, "rewards/reward_func/std": 0.5280405282974243, "sampling/importance_sampling_ratio/max": 1.3371970653533936, "sampling/importance_sampling_ratio/mean": 0.980187714099884, "sampling/importance_sampling_ratio/min": 0.6122799515724182, "sampling/sampling_logp_difference/max": 0.3190453052520752, "sampling/sampling_logp_difference/mean": 0.0227971188724041, "step": 226, "step_time": 74.47182274601073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.38980555534362793, "epoch": 0.454, "frac_reward_zero_std": 0.0, "grad_norm": 1.9645390510559082, "kl": 0.03210742026567459, "learning_rate": 4.512981658540321e-06, "loss": -0.2877, "num_tokens": 1266504.0, "reward": 0.32624998688697815, "reward_std": 0.5668675899505615, "rewards/reward_func/mean": 0.32624998688697815, "rewards/reward_func/std": 0.5424794554710388, "sampling/importance_sampling_ratio/max": 1.9740383625030518, "sampling/importance_sampling_ratio/mean": 0.9537367224693298, "sampling/importance_sampling_ratio/min": 0.35949572920799255, "sampling/sampling_logp_difference/max": 0.7103188037872314, "sampling/sampling_logp_difference/mean": 0.03307211026549339, "step": 227, "step_time": 57.815353090001736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 41.625, "completions/mean_terminated_length": 41.625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.2987158000469208, "epoch": 0.456, "frac_reward_zero_std": 0.0, "grad_norm": 1.449369192123413, "kl": 0.01764463633298874, "learning_rate": 4.508169546876547e-06, "loss": 0.1858, "num_tokens": 1272180.0, "reward": 0.19750000536441803, "reward_std": 0.30442947149276733, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.4862612783908844, "sampling/importance_sampling_ratio/max": 1.3735835552215576, "sampling/importance_sampling_ratio/mean": 0.7681852579116821, "sampling/importance_sampling_ratio/min": 0.3591448962688446, "sampling/sampling_logp_difference/max": 0.4523334801197052, "sampling/sampling_logp_difference/mean": 0.02623111382126808, "step": 228, "step_time": 62.75833543899353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3362084925174713, "epoch": 0.458, "frac_reward_zero_std": 0.0, "grad_norm": 1.9343422651290894, "kl": 0.01823657564818859, "learning_rate": 4.503336370464476e-06, "loss": -0.2018, "num_tokens": 1277910.0, "reward": 0.09624999761581421, "reward_std": 0.27238762378692627, "rewards/reward_func/mean": 0.09624999761581421, "rewards/reward_func/std": 0.3667204976081848, "sampling/importance_sampling_ratio/max": 2.1912035942077637, "sampling/importance_sampling_ratio/mean": 1.1063485145568848, "sampling/importance_sampling_ratio/min": 0.4857397675514221, "sampling/sampling_logp_difference/max": 0.8033664226531982, "sampling/sampling_logp_difference/mean": 0.022801101207733154, "step": 229, "step_time": 79.13566814499791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.37015005946159363, "epoch": 0.46, "frac_reward_zero_std": 0.0, "grad_norm": 1.9363261461257935, "kl": 0.019413193687796593, "learning_rate": 4.49848218000184e-06, "loss": -0.2319, "num_tokens": 1284156.0, "reward": 0.19249999523162842, "reward_std": 0.33216795325279236, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.4885766804218292, "sampling/importance_sampling_ratio/max": 2.352567195892334, "sampling/importance_sampling_ratio/mean": 1.2104213237762451, "sampling/importance_sampling_ratio/min": 0.4038701057434082, "sampling/sampling_logp_difference/max": 0.38236117362976074, "sampling/sampling_logp_difference/mean": 0.028649557381868362, "step": 230, "step_time": 72.49902504199417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3422005772590637, "epoch": 0.462, "frac_reward_zero_std": 0.0, "grad_norm": 1.6006170511245728, "kl": 0.025375576689839363, "learning_rate": 4.493607026406802e-06, "loss": 0.1486, "num_tokens": 1289211.0, "reward": 0.5887500047683716, "reward_std": 0.5644031763076782, "rewards/reward_func/mean": 0.5887500047683716, "rewards/reward_func/std": 0.538023829460144, "sampling/importance_sampling_ratio/max": 1.8998997211456299, "sampling/importance_sampling_ratio/mean": 0.896106481552124, "sampling/importance_sampling_ratio/min": 0.4045734107494354, "sampling/sampling_logp_difference/max": 0.6570481061935425, "sampling/sampling_logp_difference/mean": 0.02584882825613022, "step": 231, "step_time": 43.68991583000752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 46.125, "completions/mean_terminated_length": 46.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3895803689956665, "epoch": 0.464, "frac_reward_zero_std": 0.0, "grad_norm": 1.428395390510559, "kl": 0.036273203790187836, "learning_rate": 4.488710960817416e-06, "loss": -0.0911, "num_tokens": 1294840.0, "reward": 0.3199999928474426, "reward_std": 0.2673959732055664, "rewards/reward_func/mean": 0.3199999928474426, "rewards/reward_func/std": 0.515225350856781, "sampling/importance_sampling_ratio/max": 2.268913984298706, "sampling/importance_sampling_ratio/mean": 1.0703096389770508, "sampling/importance_sampling_ratio/min": 0.59864342212677, "sampling/sampling_logp_difference/max": 0.3564453125, "sampling/sampling_logp_difference/mean": 0.026173098012804985, "step": 232, "step_time": 55.16194109900971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 42.375, "completions/mean_terminated_length": 42.375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.3868124783039093, "epoch": 0.466, "frac_reward_zero_std": 0.0, "grad_norm": 1.3841030597686768, "kl": 0.05138152837753296, "learning_rate": 4.483794034591092e-06, "loss": -0.0381, "num_tokens": 1299943.0, "reward": 0.20875000953674316, "reward_std": 0.3170267939567566, "rewards/reward_func/mean": 0.20875000953674316, "rewards/reward_func/std": 0.47408372163772583, "sampling/importance_sampling_ratio/max": 1.1332757472991943, "sampling/importance_sampling_ratio/mean": 0.8772479891777039, "sampling/importance_sampling_ratio/min": 0.6085068583488464, "sampling/sampling_logp_difference/max": 0.6103432178497314, "sampling/sampling_logp_difference/mean": 0.03235594183206558, "step": 233, "step_time": 57.82354362300248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3302072584629059, "epoch": 0.468, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921053051948547, "kl": 0.01756385527551174, "learning_rate": 4.4788562993040615e-06, "loss": -0.009, "num_tokens": 1305391.0, "reward": 0.32124999165534973, "reward_std": 0.570686936378479, "rewards/reward_func/mean": 0.32124999165534973, "rewards/reward_func/std": 0.5467158555984497, "sampling/importance_sampling_ratio/max": 1.3551892042160034, "sampling/importance_sampling_ratio/mean": 0.8291321992874146, "sampling/importance_sampling_ratio/min": 0.32071855664253235, "sampling/sampling_logp_difference/max": 0.42920511960983276, "sampling/sampling_logp_difference/mean": 0.021804213523864746, "step": 234, "step_time": 75.09231540199835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.125, "completions/mean_terminated_length": 41.125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.38166582584381104, "epoch": 0.47, "frac_reward_zero_std": 0.0, "grad_norm": 1.6319152116775513, "kl": 0.03030114620923996, "learning_rate": 4.473897806750829e-06, "loss": -0.0721, "num_tokens": 1311091.0, "reward": 0.05250000208616257, "reward_std": 0.29743990302085876, "rewards/reward_func/mean": 0.05250000208616257, "rewards/reward_func/std": 0.3858848810195923, "sampling/importance_sampling_ratio/max": 1.8289971351623535, "sampling/importance_sampling_ratio/mean": 0.8988316059112549, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.950005054473877, "sampling/sampling_logp_difference/mean": 0.03348758816719055, "step": 235, "step_time": 76.2468694190029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 43.625, "completions/mean_terminated_length": 43.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.35070234537124634, "epoch": 0.472, "frac_reward_zero_std": 0.0, "grad_norm": 1.1173913478851318, "kl": 0.03465205430984497, "learning_rate": 4.4689186089436365e-06, "loss": -0.0474, "num_tokens": 1316336.0, "reward": 0.2150000035762787, "reward_std": 0.3214074671268463, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.485386461019516, "sampling/importance_sampling_ratio/max": 1.4250974655151367, "sampling/importance_sampling_ratio/mean": 0.7525547742843628, "sampling/importance_sampling_ratio/min": 0.2883736193180084, "sampling/sampling_logp_difference/max": 0.680816650390625, "sampling/sampling_logp_difference/mean": 0.024951238185167313, "step": 236, "step_time": 45.73709376499755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.34809672832489014, "epoch": 0.474, "frac_reward_zero_std": 0.0, "grad_norm": 1.1273499727249146, "kl": 0.026207346469163895, "learning_rate": 4.463918758111912e-06, "loss": 0.1471, "num_tokens": 1322121.0, "reward": -0.03500000014901161, "reward_std": 0.028673537075519562, "rewards/reward_func/mean": -0.03500000014901161, "rewards/reward_func/std": 0.03999999910593033, "sampling/importance_sampling_ratio/max": 1.5887080430984497, "sampling/importance_sampling_ratio/mean": 0.9801490306854248, "sampling/importance_sampling_ratio/min": 0.5287134647369385, "sampling/sampling_logp_difference/max": 0.49480628967285156, "sampling/sampling_logp_difference/mean": 0.025246813893318176, "step": 237, "step_time": 90.44008958002087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 43.625, "completions/mean_terminated_length": 43.625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.3520908057689667, "epoch": 0.476, "frac_reward_zero_std": 0.0, "grad_norm": 1.0922024250030518, "kl": 0.031959474086761475, "learning_rate": 4.4588983067017255e-06, "loss": 0.1783, "num_tokens": 1328212.0, "reward": 0.2212499976158142, "reward_std": 0.5126502513885498, "rewards/reward_func/mean": 0.2212499976158142, "rewards/reward_func/std": 0.4746558964252472, "sampling/importance_sampling_ratio/max": 1.6246837377548218, "sampling/importance_sampling_ratio/mean": 0.8979704976081848, "sampling/importance_sampling_ratio/min": 0.3638645112514496, "sampling/sampling_logp_difference/max": 0.4873514175415039, "sampling/sampling_logp_difference/mean": 0.028458524495363235, "step": 238, "step_time": 71.68060715598403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3871595859527588, "epoch": 0.478, "frac_reward_zero_std": 0.0, "grad_norm": 1.2288389205932617, "kl": 0.01771704852581024, "learning_rate": 4.4538573073752365e-06, "loss": 0.0403, "num_tokens": 1333316.0, "reward": 0.054999999701976776, "reward_std": 0.29602476954460144, "rewards/reward_func/mean": 0.054999999701976776, "rewards/reward_func/std": 0.36570870876312256, "sampling/importance_sampling_ratio/max": 1.5434668064117432, "sampling/importance_sampling_ratio/mean": 0.9942148923873901, "sampling/importance_sampling_ratio/min": 0.48395439982414246, "sampling/sampling_logp_difference/max": 0.6915938854217529, "sampling/sampling_logp_difference/mean": 0.035692013800144196, "step": 239, "step_time": 70.68868763197679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.31809288263320923, "epoch": 0.48, "frac_reward_zero_std": 0.0, "grad_norm": 1.0677132606506348, "kl": 0.011697100475430489, "learning_rate": 4.448795813010142e-06, "loss": -0.1253, "num_tokens": 1338733.0, "reward": 0.36125001311302185, "reward_std": 0.5394142866134644, "rewards/reward_func/mean": 0.36125001311302185, "rewards/reward_func/std": 0.5180302262306213, "sampling/importance_sampling_ratio/max": 1.8030056953430176, "sampling/importance_sampling_ratio/mean": 1.1689889430999756, "sampling/importance_sampling_ratio/min": 0.7900420427322388, "sampling/sampling_logp_difference/max": 0.3128845691680908, "sampling/sampling_logp_difference/mean": 0.021864818409085274, "step": 240, "step_time": 66.84894346201327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.29514139890670776, "epoch": 0.482, "frac_reward_zero_std": 0.0, "grad_norm": 1.0881181955337524, "kl": 0.02634306624531746, "learning_rate": 4.443713876699124e-06, "loss": -0.0806, "num_tokens": 1344418.0, "reward": 0.07999999821186066, "reward_std": 0.25690117478370667, "rewards/reward_func/mean": 0.07999999821186066, "rewards/reward_func/std": 0.33342379331588745, "sampling/importance_sampling_ratio/max": 1.363787055015564, "sampling/importance_sampling_ratio/mean": 0.8313639163970947, "sampling/importance_sampling_ratio/min": 0.3071029484272003, "sampling/sampling_logp_difference/max": 0.6727430820465088, "sampling/sampling_logp_difference/mean": 0.02636832371354103, "step": 241, "step_time": 81.16167590999976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.36427628993988037, "epoch": 0.484, "frac_reward_zero_std": 0.0, "grad_norm": 1.100838541984558, "kl": 0.041223861277103424, "learning_rate": 4.438611551749288e-06, "loss": -0.2757, "num_tokens": 1350588.0, "reward": 0.5874999761581421, "reward_std": 0.5468531847000122, "rewards/reward_func/mean": 0.5874999761581421, "rewards/reward_func/std": 0.5267623662948608, "sampling/importance_sampling_ratio/max": 2.2273752689361572, "sampling/importance_sampling_ratio/mean": 1.0134867429733276, "sampling/importance_sampling_ratio/min": 0.3902888894081116, "sampling/sampling_logp_difference/max": 0.5989378690719604, "sampling/sampling_logp_difference/mean": 0.02611200511455536, "step": 242, "step_time": 67.87047297498793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 44.375, "completions/mean_terminated_length": 44.375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.35488879680633545, "epoch": 0.486, "frac_reward_zero_std": 0.0, "grad_norm": 1.331992268562317, "kl": 0.044845033437013626, "learning_rate": 4.4334888916816096e-06, "loss": -0.0932, "num_tokens": 1355918.0, "reward": 0.20125000178813934, "reward_std": 0.5272895097732544, "rewards/reward_func/mean": 0.20125000178813934, "rewards/reward_func/std": 0.4884212911128998, "sampling/importance_sampling_ratio/max": 1.0778487920761108, "sampling/importance_sampling_ratio/mean": 0.9069632887840271, "sampling/importance_sampling_ratio/min": 0.3717224597930908, "sampling/sampling_logp_difference/max": 0.729764461517334, "sampling/sampling_logp_difference/mean": 0.026907198131084442, "step": 243, "step_time": 67.90137365201372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 44.125, "completions/mean_terminated_length": 44.125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3981369733810425, "epoch": 0.488, "frac_reward_zero_std": 0.0, "grad_norm": 1.1184395551681519, "kl": 0.015510768629610538, "learning_rate": 4.42834595023037e-06, "loss": -0.0087, "num_tokens": 1360516.0, "reward": 0.5824999809265137, "reward_std": 0.5755907893180847, "rewards/reward_func/mean": 0.5824999809265137, "rewards/reward_func/std": 0.5471158027648926, "sampling/importance_sampling_ratio/max": 1.1929256916046143, "sampling/importance_sampling_ratio/mean": 0.7005432844161987, "sampling/importance_sampling_ratio/min": 0.47305941581726074, "sampling/sampling_logp_difference/max": 0.354036808013916, "sampling/sampling_logp_difference/mean": 0.024651892483234406, "step": 244, "step_time": 61.913708773994585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.31958675384521484, "epoch": 0.49, "frac_reward_zero_std": 0.0, "grad_norm": 1.1137831211090088, "kl": 0.024688197299838066, "learning_rate": 4.423182781342589e-06, "loss": -0.0889, "num_tokens": 1365727.0, "reward": 0.32625001668930054, "reward_std": 0.5717824697494507, "rewards/reward_func/mean": 0.32625001668930054, "rewards/reward_func/std": 0.5434266328811646, "sampling/importance_sampling_ratio/max": 1.4841914176940918, "sampling/importance_sampling_ratio/mean": 0.8029133081436157, "sampling/importance_sampling_ratio/min": 0.39716988801956177, "sampling/sampling_logp_difference/max": 0.5414783358573914, "sampling/sampling_logp_difference/mean": 0.02428363636136055, "step": 245, "step_time": 68.99983911900199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 46.125, "completions/mean_terminated_length": 46.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.33968907594680786, "epoch": 0.492, "frac_reward_zero_std": 0.0, "grad_norm": 1.2323132753372192, "kl": 0.01886364072561264, "learning_rate": 4.417999439177465e-06, "loss": 0.2441, "num_tokens": 1371605.0, "reward": 0.08750000596046448, "reward_std": 0.2773665487766266, "rewards/reward_func/mean": 0.08750000596046448, "rewards/reward_func/std": 0.36577707529067993, "sampling/importance_sampling_ratio/max": 1.7966817617416382, "sampling/importance_sampling_ratio/mean": 0.9920728802680969, "sampling/importance_sampling_ratio/min": 0.46240681409835815, "sampling/sampling_logp_difference/max": 0.3581950068473816, "sampling/sampling_logp_difference/mean": 0.02110222354531288, "step": 246, "step_time": 86.06116112999734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3882921040058136, "epoch": 0.494, "frac_reward_zero_std": 0.0, "grad_norm": 1.0457669496536255, "kl": 0.0466264933347702, "learning_rate": 4.412795978105807e-06, "loss": 0.0479, "num_tokens": 1377108.0, "reward": 0.08125000447034836, "reward_std": 0.290319561958313, "rewards/reward_func/mean": 0.08125000447034836, "rewards/reward_func/std": 0.3741442859172821, "sampling/importance_sampling_ratio/max": 1.3163291215896606, "sampling/importance_sampling_ratio/mean": 0.8274441957473755, "sampling/importance_sampling_ratio/min": 0.5468934178352356, "sampling/sampling_logp_difference/max": 0.3266195058822632, "sampling/sampling_logp_difference/mean": 0.023234577849507332, "step": 247, "step_time": 69.2718325239839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3644499182701111, "epoch": 0.496, "frac_reward_zero_std": 0.0, "grad_norm": 0.8662134408950806, "kl": 0.028522610664367676, "learning_rate": 4.407572452709459e-06, "loss": -0.1758, "num_tokens": 1382458.0, "reward": 0.3387500047683716, "reward_std": 0.2905788719654083, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.550206184387207, "sampling/importance_sampling_ratio/max": 1.9892619848251343, "sampling/importance_sampling_ratio/mean": 0.994696319103241, "sampling/importance_sampling_ratio/min": 0.32547527551651, "sampling/sampling_logp_difference/max": 0.5737671852111816, "sampling/sampling_logp_difference/mean": 0.029661521315574646, "step": 248, "step_time": 55.92289188998984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.340048223733902, "epoch": 0.498, "frac_reward_zero_std": 0.0, "grad_norm": 1.2371996641159058, "kl": 0.014844512566924095, "learning_rate": 4.402328917780728e-06, "loss": 0.1829, "num_tokens": 1387909.0, "reward": 0.32249999046325684, "reward_std": 0.5570697784423828, "rewards/reward_func/mean": 0.32249999046325684, "rewards/reward_func/std": 0.531970739364624, "sampling/importance_sampling_ratio/max": 1.771705985069275, "sampling/importance_sampling_ratio/mean": 1.064762830734253, "sampling/importance_sampling_ratio/min": 0.49545010924339294, "sampling/sampling_logp_difference/max": 0.531287670135498, "sampling/sampling_logp_difference/mean": 0.022021599113941193, "step": 249, "step_time": 72.32509338197997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.125, "completions/mean_terminated_length": 53.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3873959183692932, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.9252265691757202, "kl": 0.01229300070554018, "learning_rate": 4.397065428321818e-06, "loss": 0.0921, "num_tokens": 1393363.0, "reward": 0.34375, "reward_std": 0.5575473308563232, "rewards/reward_func/mean": 0.34375, "rewards/reward_func/std": 0.5310620069503784, "sampling/importance_sampling_ratio/max": 1.7814934253692627, "sampling/importance_sampling_ratio/mean": 1.0325000286102295, "sampling/importance_sampling_ratio/min": 0.5736287832260132, "sampling/sampling_logp_difference/max": 0.45818281173706055, "sampling/sampling_logp_difference/mean": 0.026250842958688736, "step": 250, "step_time": 90.09270300000207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3573990762233734, "epoch": 0.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.9304350018501282, "kl": 0.01920994371175766, "learning_rate": 4.391782039544239e-06, "loss": 0.1493, "num_tokens": 1399112.0, "reward": 0.3375000059604645, "reward_std": 0.5359517931938171, "rewards/reward_func/mean": 0.3375000059604645, "rewards/reward_func/std": 0.5219400525093079, "sampling/importance_sampling_ratio/max": 1.4142625331878662, "sampling/importance_sampling_ratio/mean": 0.8782503604888916, "sampling/importance_sampling_ratio/min": 0.3311513364315033, "sampling/sampling_logp_difference/max": 0.5579543113708496, "sampling/sampling_logp_difference/mean": 0.024326374754309654, "step": 251, "step_time": 80.91022923600394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3160034418106079, "epoch": 0.504, "frac_reward_zero_std": 0.0, "grad_norm": 2.0592174530029297, "kl": 0.01498313620686531, "learning_rate": 4.386478806868242e-06, "loss": 0.2131, "num_tokens": 1404423.0, "reward": 0.19875001907348633, "reward_std": 0.3156750202178955, "rewards/reward_func/mean": 0.19875001907348633, "rewards/reward_func/std": 0.4896481931209564, "sampling/importance_sampling_ratio/max": 2.2778773307800293, "sampling/importance_sampling_ratio/mean": 1.1893842220306396, "sampling/importance_sampling_ratio/min": 0.4248703420162201, "sampling/sampling_logp_difference/max": 0.31923460960388184, "sampling/sampling_logp_difference/mean": 0.021706879138946533, "step": 252, "step_time": 76.23892499000067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3231382369995117, "epoch": 0.506, "frac_reward_zero_std": 0.0, "grad_norm": 1.1245347261428833, "kl": 0.02181754633784294, "learning_rate": 4.381155785922226e-06, "loss": 0.1193, "num_tokens": 1409836.0, "reward": 0.30375000834465027, "reward_std": 0.5851833820343018, "rewards/reward_func/mean": 0.30375000834465027, "rewards/reward_func/std": 0.56360924243927, "sampling/importance_sampling_ratio/max": 2.5759124755859375, "sampling/importance_sampling_ratio/mean": 1.0727500915527344, "sampling/importance_sampling_ratio/min": 0.623710036277771, "sampling/sampling_logp_difference/max": 0.6664900779724121, "sampling/sampling_logp_difference/mean": 0.023033898323774338, "step": 253, "step_time": 63.543558523000684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3466748595237732, "epoch": 0.508, "frac_reward_zero_std": 0.0, "grad_norm": 1.142482042312622, "kl": 0.0223647840321064, "learning_rate": 4.375813032542164e-06, "loss": -0.0771, "num_tokens": 1415411.0, "reward": 0.21000000834465027, "reward_std": 0.3328624665737152, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.48594531416893005, "sampling/importance_sampling_ratio/max": 2.0455007553100586, "sampling/importance_sampling_ratio/mean": 1.0768548250198364, "sampling/importance_sampling_ratio/min": 0.49030447006225586, "sampling/sampling_logp_difference/max": 0.5383121967315674, "sampling/sampling_logp_difference/mean": 0.03029092587530613, "step": 254, "step_time": 89.68648639999446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3746393322944641, "epoch": 0.51, "frac_reward_zero_std": 0.0, "grad_norm": 2.6275901794433594, "kl": 0.032982099801301956, "learning_rate": 4.37045060277101e-06, "loss": -0.3207, "num_tokens": 1420829.0, "reward": 0.07375000417232513, "reward_std": 0.2716521620750427, "rewards/reward_func/mean": 0.07375000417232513, "rewards/reward_func/std": 0.3556457757949829, "sampling/importance_sampling_ratio/max": 2.175576686859131, "sampling/importance_sampling_ratio/mean": 1.019911527633667, "sampling/importance_sampling_ratio/min": 0.40404126048088074, "sampling/sampling_logp_difference/max": 0.3250246047973633, "sampling/sampling_logp_difference/mean": 0.024091674014925957, "step": 255, "step_time": 82.5458397520124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.3585067689418793, "epoch": 0.512, "frac_reward_zero_std": 0.0, "grad_norm": 0.829698383808136, "kl": 0.021870668977499008, "learning_rate": 4.365068552858116e-06, "loss": 0.0817, "num_tokens": 1426845.0, "reward": 0.1887499988079071, "reward_std": 0.5287714004516602, "rewards/reward_func/mean": 0.1887499988079071, "rewards/reward_func/std": 0.4895898401737213, "sampling/importance_sampling_ratio/max": 1.7392264604568481, "sampling/importance_sampling_ratio/mean": 0.691516637802124, "sampling/importance_sampling_ratio/min": 0.22693133354187012, "sampling/sampling_logp_difference/max": 0.8031024932861328, "sampling/sampling_logp_difference/mean": 0.028423123061656952, "step": 256, "step_time": 77.55902498602518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3533346652984619, "epoch": 0.514, "frac_reward_zero_std": 0.0, "grad_norm": 1.0792738199234009, "kl": 0.01500007789582014, "learning_rate": 4.359666939258637e-06, "loss": -0.1518, "num_tokens": 1432532.0, "reward": 0.07000000774860382, "reward_std": 0.2898591160774231, "rewards/reward_func/mean": 0.07000000774860382, "rewards/reward_func/std": 0.38045087456703186, "sampling/importance_sampling_ratio/max": 1.7748743295669556, "sampling/importance_sampling_ratio/mean": 1.000688910484314, "sampling/importance_sampling_ratio/min": 0.3758687973022461, "sampling/sampling_logp_difference/max": 0.6241648197174072, "sampling/sampling_logp_difference/mean": 0.028312578797340393, "step": 257, "step_time": 73.71465296699898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.36250796914100647, "epoch": 0.516, "frac_reward_zero_std": 0.0, "grad_norm": 1.3215895891189575, "kl": 0.020508520305156708, "learning_rate": 4.354245818632944e-06, "loss": -0.2258, "num_tokens": 1438131.0, "reward": -0.0949999988079071, "reward_std": 0.0752500668168068, "rewards/reward_func/mean": -0.0949999988079071, "rewards/reward_func/std": 0.07559289783239365, "sampling/importance_sampling_ratio/max": 2.191699743270874, "sampling/importance_sampling_ratio/mean": 1.1482765674591064, "sampling/importance_sampling_ratio/min": 0.5960127115249634, "sampling/sampling_logp_difference/max": 0.35130882263183594, "sampling/sampling_logp_difference/mean": 0.02419322356581688, "step": 258, "step_time": 78.94406015900313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3972855806350708, "epoch": 0.518, "frac_reward_zero_std": 0.0, "grad_norm": 1.4929661750793457, "kl": 0.021859250962734222, "learning_rate": 4.348805247846027e-06, "loss": 0.1502, "num_tokens": 1444118.0, "reward": 0.2149999886751175, "reward_std": 0.31619954109191895, "rewards/reward_func/mean": 0.2149999886751175, "rewards/reward_func/std": 0.4730146527290344, "sampling/importance_sampling_ratio/max": 1.930320382118225, "sampling/importance_sampling_ratio/mean": 1.0938146114349365, "sampling/importance_sampling_ratio/min": 0.44256508350372314, "sampling/sampling_logp_difference/max": 0.29829633235931396, "sampling/sampling_logp_difference/mean": 0.027022160589694977, "step": 259, "step_time": 78.32750187598867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.35829824209213257, "epoch": 0.52, "frac_reward_zero_std": 0.0, "grad_norm": 1.167080044746399, "kl": 0.013501507230103016, "learning_rate": 4.343345283966901e-06, "loss": -0.1633, "num_tokens": 1449057.0, "reward": 0.4612500071525574, "reward_std": 0.6074321269989014, "rewards/reward_func/mean": 0.4612500071525574, "rewards/reward_func/std": 0.5628102421760559, "sampling/importance_sampling_ratio/max": 1.504439115524292, "sampling/importance_sampling_ratio/mean": 1.075119972229004, "sampling/importance_sampling_ratio/min": 0.27312949299812317, "sampling/sampling_logp_difference/max": 0.33127808570861816, "sampling/sampling_logp_difference/mean": 0.026082661002874374, "step": 260, "step_time": 60.16271702598897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3394550681114197, "epoch": 0.522, "frac_reward_zero_std": 0.0, "grad_norm": 1.2084001302719116, "kl": 0.023300688713788986, "learning_rate": 4.337865984268002e-06, "loss": -0.0475, "num_tokens": 1454514.0, "reward": 0.21000000834465027, "reward_std": 0.5280653238296509, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.48890548944473267, "sampling/importance_sampling_ratio/max": 1.85581374168396, "sampling/importance_sampling_ratio/mean": 1.034727692604065, "sampling/importance_sampling_ratio/min": 0.5134819149971008, "sampling/sampling_logp_difference/max": 0.6527895927429199, "sampling/sampling_logp_difference/mean": 0.028075508773326874, "step": 261, "step_time": 65.41974141600076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.3510357737541199, "epoch": 0.524, "frac_reward_zero_std": 0.0, "grad_norm": 2.0556254386901855, "kl": 0.016779771074652672, "learning_rate": 4.33236740622459e-06, "loss": -0.177, "num_tokens": 1460819.0, "reward": -0.06750000268220901, "reward_std": 0.056236058473587036, "rewards/reward_func/mean": -0.06750000268220901, "rewards/reward_func/std": 0.05548487976193428, "sampling/importance_sampling_ratio/max": 2.7717020511627197, "sampling/importance_sampling_ratio/mean": 1.2683167457580566, "sampling/importance_sampling_ratio/min": 0.6609295010566711, "sampling/sampling_logp_difference/max": 0.4664306640625, "sampling/sampling_logp_difference/mean": 0.024730544537305832, "step": 262, "step_time": 89.55936830199789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3273940086364746, "epoch": 0.526, "frac_reward_zero_std": 0.0, "grad_norm": 1.300922155380249, "kl": 0.0253826305270195, "learning_rate": 4.326849607514149e-06, "loss": -0.1908, "num_tokens": 1466312.0, "reward": 0.32750001549720764, "reward_std": 0.5473343133926392, "rewards/reward_func/mean": 0.32750001549720764, "rewards/reward_func/std": 0.5286033153533936, "sampling/importance_sampling_ratio/max": 1.702580213546753, "sampling/importance_sampling_ratio/mean": 1.12638521194458, "sampling/importance_sampling_ratio/min": 0.5338081121444702, "sampling/sampling_logp_difference/max": 0.4523458480834961, "sampling/sampling_logp_difference/mean": 0.024661045521497726, "step": 263, "step_time": 67.0695453399967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3104347586631775, "epoch": 0.528, "frac_reward_zero_std": 0.0, "grad_norm": 0.8551791906356812, "kl": 0.013168178498744965, "learning_rate": 4.321312646015775e-06, "loss": -0.0571, "num_tokens": 1471010.0, "reward": 0.3400000035762787, "reward_std": 0.5720411539077759, "rewards/reward_func/mean": 0.3400000035762787, "rewards/reward_func/std": 0.5474616289138794, "sampling/importance_sampling_ratio/max": 1.145720362663269, "sampling/importance_sampling_ratio/mean": 0.6736248135566711, "sampling/importance_sampling_ratio/min": 0.32681626081466675, "sampling/sampling_logp_difference/max": 0.506934404373169, "sampling/sampling_logp_difference/mean": 0.022311819717288017, "step": 264, "step_time": 69.11695815299754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.36542245745658875, "epoch": 0.53, "frac_reward_zero_std": 0.0, "grad_norm": 0.7678577899932861, "kl": 0.01349552534520626, "learning_rate": 4.315756579809575e-06, "loss": -0.0131, "num_tokens": 1475783.0, "reward": 0.45250001549720764, "reward_std": 0.5276904106140137, "rewards/reward_func/mean": 0.45250001549720764, "rewards/reward_func/std": 0.5621070265769958, "sampling/importance_sampling_ratio/max": 1.4794089794158936, "sampling/importance_sampling_ratio/mean": 0.8411662578582764, "sampling/importance_sampling_ratio/min": 0.2986375391483307, "sampling/sampling_logp_difference/max": 0.3246455192565918, "sampling/sampling_logp_difference/mean": 0.02376371994614601, "step": 265, "step_time": 60.26253752099001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.39109036326408386, "epoch": 0.532, "frac_reward_zero_std": 0.0, "grad_norm": 1.601122260093689, "kl": 0.030363351106643677, "learning_rate": 4.3101814671760546e-06, "loss": 0.1835, "num_tokens": 1480977.0, "reward": 0.19500000774860382, "reward_std": 0.498978853225708, "rewards/reward_func/mean": 0.19500000774860382, "rewards/reward_func/std": 0.46309521794319153, "sampling/importance_sampling_ratio/max": 1.9800269603729248, "sampling/importance_sampling_ratio/mean": 1.074782133102417, "sampling/importance_sampling_ratio/min": 0.28303632140159607, "sampling/sampling_logp_difference/max": 0.3251028060913086, "sampling/sampling_logp_difference/mean": 0.03300042822957039, "step": 266, "step_time": 73.40508937300183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.31551459431648254, "epoch": 0.534, "frac_reward_zero_std": 0.0, "grad_norm": 1.2413638830184937, "kl": 0.046541355550289154, "learning_rate": 4.304587366595506e-06, "loss": 0.0603, "num_tokens": 1486647.0, "reward": -0.03125, "reward_std": 0.03729227930307388, "rewards/reward_func/mean": -0.03125, "rewards/reward_func/std": 0.035632047802209854, "sampling/importance_sampling_ratio/max": 1.5068446397781372, "sampling/importance_sampling_ratio/mean": 1.0979515314102173, "sampling/importance_sampling_ratio/min": 0.7618433237075806, "sampling/sampling_logp_difference/max": 0.4640469551086426, "sampling/sampling_logp_difference/mean": 0.021073922514915466, "step": 267, "step_time": 78.26023236100446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3318823575973511, "epoch": 0.536, "frac_reward_zero_std": 0.0, "grad_norm": 1.2611191272735596, "kl": 0.04281270503997803, "learning_rate": 4.298974336747397e-06, "loss": 0.1527, "num_tokens": 1491435.0, "reward": 0.4449999928474426, "reward_std": 0.5528978109359741, "rewards/reward_func/mean": 0.4449999928474426, "rewards/reward_func/std": 0.5943784117698669, "sampling/importance_sampling_ratio/max": 2.169663190841675, "sampling/importance_sampling_ratio/mean": 1.2059040069580078, "sampling/importance_sampling_ratio/min": 0.5681382417678833, "sampling/sampling_logp_difference/max": 0.44361448287963867, "sampling/sampling_logp_difference/mean": 0.02404342032968998, "step": 268, "step_time": 42.1042278399982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3449662923812866, "epoch": 0.538, "frac_reward_zero_std": 0.0, "grad_norm": 1.4394900798797607, "kl": 0.04537257179617882, "learning_rate": 4.2933424365097565e-06, "loss": -0.0417, "num_tokens": 1497478.0, "reward": 0.23375000059604645, "reward_std": 0.3076711595058441, "rewards/reward_func/mean": 0.23375000059604645, "rewards/reward_func/std": 0.4731939435005188, "sampling/importance_sampling_ratio/max": 1.6045145988464355, "sampling/importance_sampling_ratio/mean": 0.9230844974517822, "sampling/importance_sampling_ratio/min": 0.4231981933116913, "sampling/sampling_logp_difference/max": 0.4870121479034424, "sampling/sampling_logp_difference/mean": 0.027318792417645454, "step": 269, "step_time": 78.79464126299717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3510008454322815, "epoch": 0.54, "frac_reward_zero_std": 0.0, "grad_norm": 1.2418956756591797, "kl": 0.018656443804502487, "learning_rate": 4.287691724958551e-06, "loss": 0.0041, "num_tokens": 1502743.0, "reward": 0.05625000223517418, "reward_std": 0.30281367897987366, "rewards/reward_func/mean": 0.05625000223517418, "rewards/reward_func/std": 0.38615089654922485, "sampling/importance_sampling_ratio/max": 1.43633234500885, "sampling/importance_sampling_ratio/mean": 0.9880182147026062, "sampling/importance_sampling_ratio/min": 0.5201124548912048, "sampling/sampling_logp_difference/max": 0.4504268169403076, "sampling/sampling_logp_difference/mean": 0.025521527975797653, "step": 270, "step_time": 61.80690009400132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 46.375, "completions/mean_terminated_length": 46.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3246690034866333, "epoch": 0.542, "frac_reward_zero_std": 0.0, "grad_norm": 1.149324893951416, "kl": 0.07095597684383392, "learning_rate": 4.282022261367074e-06, "loss": 0.1048, "num_tokens": 1508366.0, "reward": 0.33375000953674316, "reward_std": 0.5544325113296509, "rewards/reward_func/mean": 0.33375000953674316, "rewards/reward_func/std": 0.5303351283073425, "sampling/importance_sampling_ratio/max": 2.206528425216675, "sampling/importance_sampling_ratio/mean": 0.8827699422836304, "sampling/importance_sampling_ratio/min": 0.25518810749053955, "sampling/sampling_logp_difference/max": 1.0000684261322021, "sampling/sampling_logp_difference/mean": 0.03170555830001831, "step": 271, "step_time": 75.03205436599092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.30604857206344604, "epoch": 0.544, "frac_reward_zero_std": 0.0, "grad_norm": 1.9059810638427734, "kl": 0.059660904109478, "learning_rate": 4.276334105205312e-06, "loss": -0.2118, "num_tokens": 1513438.0, "reward": 0.06875000149011612, "reward_std": 0.2786497473716736, "rewards/reward_func/mean": 0.06875000149011612, "rewards/reward_func/std": 0.36317792534828186, "sampling/importance_sampling_ratio/max": 2.2432162761688232, "sampling/importance_sampling_ratio/mean": 0.9688401222229004, "sampling/importance_sampling_ratio/min": 0.4208078682422638, "sampling/sampling_logp_difference/max": 0.7957940101623535, "sampling/sampling_logp_difference/mean": 0.025905363261699677, "step": 272, "step_time": 70.6048888520163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.42150887846946716, "epoch": 0.546, "frac_reward_zero_std": 0.0, "grad_norm": 1.848087191581726, "kl": 0.04411306977272034, "learning_rate": 4.270627316139333e-06, "loss": 0.0721, "num_tokens": 1519207.0, "reward": 0.3174999952316284, "reward_std": 0.3132410943508148, "rewards/reward_func/mean": 0.3174999952316284, "rewards/reward_func/std": 0.5588700175285339, "sampling/importance_sampling_ratio/max": 2.6300036907196045, "sampling/importance_sampling_ratio/mean": 1.3033478260040283, "sampling/importance_sampling_ratio/min": 0.6930631399154663, "sampling/sampling_logp_difference/max": 0.4798305034637451, "sampling/sampling_logp_difference/mean": 0.027141718193888664, "step": 273, "step_time": 58.65063955899677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 45.375, "completions/mean_terminated_length": 45.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.346945196390152, "epoch": 0.548, "frac_reward_zero_std": 0.0, "grad_norm": 1.3174351453781128, "kl": 0.03675675392150879, "learning_rate": 4.264901954030655e-06, "loss": 0.2718, "num_tokens": 1524595.0, "reward": 0.3349999785423279, "reward_std": 0.5533304214477539, "rewards/reward_func/mean": 0.3349999785423279, "rewards/reward_func/std": 0.5276091694831848, "sampling/importance_sampling_ratio/max": 1.823628306388855, "sampling/importance_sampling_ratio/mean": 1.3205350637435913, "sampling/importance_sampling_ratio/min": 0.5515703558921814, "sampling/sampling_logp_difference/max": 0.6266647577285767, "sampling/sampling_logp_difference/mean": 0.027004873380064964, "step": 274, "step_time": 77.85147952000261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 46.875, "completions/mean_terminated_length": 46.875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.34322333335876465, "epoch": 0.55, "frac_reward_zero_std": 0.0, "grad_norm": 1.5239591598510742, "kl": 0.03387777507305145, "learning_rate": 4.259158078935616e-06, "loss": 0.0834, "num_tokens": 1530599.0, "reward": 0.6112500429153442, "reward_std": 0.5479111671447754, "rewards/reward_func/mean": 0.6112500429153442, "rewards/reward_func/std": 0.5258309841156006, "sampling/importance_sampling_ratio/max": 1.549354910850525, "sampling/importance_sampling_ratio/mean": 0.9935581088066101, "sampling/importance_sampling_ratio/min": 0.30347806215286255, "sampling/sampling_logp_difference/max": 0.5402736663818359, "sampling/sampling_logp_difference/mean": 0.026231858879327774, "step": 275, "step_time": 44.57912472402677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.35677075386047363, "epoch": 0.552, "frac_reward_zero_std": 0.0, "grad_norm": 1.31052827835083, "kl": 0.03178555518388748, "learning_rate": 4.2533957511047485e-06, "loss": -0.2195, "num_tokens": 1536340.0, "reward": 0.33500000834465027, "reward_std": 0.5740325450897217, "rewards/reward_func/mean": 0.33500000834465027, "rewards/reward_func/std": 0.551310122013092, "sampling/importance_sampling_ratio/max": 1.479032039642334, "sampling/importance_sampling_ratio/mean": 1.0965967178344727, "sampling/importance_sampling_ratio/min": 0.65904301404953, "sampling/sampling_logp_difference/max": 0.5930355787277222, "sampling/sampling_logp_difference/mean": 0.027675746008753777, "step": 276, "step_time": 56.27619910798967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3377889394760132, "epoch": 0.554, "frac_reward_zero_std": 0.0, "grad_norm": 0.999190628528595, "kl": 0.032812558114528656, "learning_rate": 4.247615030982144e-06, "loss": 0.0927, "num_tokens": 1541902.0, "reward": 0.4775000214576721, "reward_std": 0.4942038655281067, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.527304470539093, "sampling/importance_sampling_ratio/max": 1.5036600828170776, "sampling/importance_sampling_ratio/mean": 0.9121675491333008, "sampling/importance_sampling_ratio/min": 0.3932625651359558, "sampling/sampling_logp_difference/max": 0.6314131021499634, "sampling/sampling_logp_difference/mean": 0.027674881741404533, "step": 277, "step_time": 66.32918151500053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.35427534580230713, "epoch": 0.556, "frac_reward_zero_std": 0.0, "grad_norm": 1.1933726072311401, "kl": 0.06276769191026688, "learning_rate": 4.241815979204822e-06, "loss": 0.1654, "num_tokens": 1548032.0, "reward": 0.19750000536441803, "reward_std": 0.32018035650253296, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.48417091369628906, "sampling/importance_sampling_ratio/max": 1.7151527404785156, "sampling/importance_sampling_ratio/mean": 1.0098499059677124, "sampling/importance_sampling_ratio/min": 0.11691775172948837, "sampling/sampling_logp_difference/max": 1.2129077911376953, "sampling/sampling_logp_difference/mean": 0.02612270414829254, "step": 278, "step_time": 88.54080397897633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3449207544326782, "epoch": 0.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.924571692943573, "kl": 0.040953975170850754, "learning_rate": 4.235998656602091e-06, "loss": 0.1445, "num_tokens": 1553989.0, "reward": -0.06750000268220901, "reward_std": 0.03040887415409088, "rewards/reward_func/mean": -0.06750000268220901, "rewards/reward_func/std": 0.051199886947870255, "sampling/importance_sampling_ratio/max": 0.9909558892250061, "sampling/importance_sampling_ratio/mean": 0.7079716920852661, "sampling/importance_sampling_ratio/min": 0.47882190346717834, "sampling/sampling_logp_difference/max": 0.6735103130340576, "sampling/sampling_logp_difference/mean": 0.03071964532136917, "step": 279, "step_time": 82.45593494997593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.38967427611351013, "epoch": 0.56, "frac_reward_zero_std": 0.0, "grad_norm": 1.2218159437179565, "kl": 0.027102379128336906, "learning_rate": 4.230163124194913e-06, "loss": 0.0018, "num_tokens": 1559709.0, "reward": 0.08249999582767487, "reward_std": 0.27100443840026855, "rewards/reward_func/mean": 0.08249999582767487, "rewards/reward_func/std": 0.3715507984161377, "sampling/importance_sampling_ratio/max": 1.6555224657058716, "sampling/importance_sampling_ratio/mean": 1.070943832397461, "sampling/importance_sampling_ratio/min": 0.5340960025787354, "sampling/sampling_logp_difference/max": 0.4693126082420349, "sampling/sampling_logp_difference/mean": 0.027980361133813858, "step": 280, "step_time": 72.15682938401005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.29565370082855225, "epoch": 0.562, "frac_reward_zero_std": 0.0, "grad_norm": 1.19646418094635, "kl": 0.03536316379904747, "learning_rate": 4.224309443195261e-06, "loss": -0.2295, "num_tokens": 1565505.0, "reward": 0.48250001668930054, "reward_std": 0.5834507346153259, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5402050018310547, "sampling/importance_sampling_ratio/max": 1.867733359336853, "sampling/importance_sampling_ratio/mean": 1.0055427551269531, "sampling/importance_sampling_ratio/min": 0.4048961102962494, "sampling/sampling_logp_difference/max": 0.6614785194396973, "sampling/sampling_logp_difference/mean": 0.027097908779978752, "step": 281, "step_time": 51.57847329697688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 56.125, "completions/mean_terminated_length": 56.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.35139578580856323, "epoch": 0.564, "frac_reward_zero_std": 0.0, "grad_norm": 1.086079478263855, "kl": 0.022778205573558807, "learning_rate": 4.218437675005479e-06, "loss": -0.2493, "num_tokens": 1571510.0, "reward": 0.5787500143051147, "reward_std": 0.5751041173934937, "rewards/reward_func/mean": 0.5787500143051147, "rewards/reward_func/std": 0.5546797513961792, "sampling/importance_sampling_ratio/max": 1.7726209163665771, "sampling/importance_sampling_ratio/mean": 1.1253960132598877, "sampling/importance_sampling_ratio/min": 0.4784102737903595, "sampling/sampling_logp_difference/max": 0.4526965618133545, "sampling/sampling_logp_difference/mean": 0.02535804733633995, "step": 282, "step_time": 64.53625944399391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.398438036441803, "epoch": 0.566, "frac_reward_zero_std": 0.0, "grad_norm": 1.212346076965332, "kl": 0.037971317768096924, "learning_rate": 4.212547881217637e-06, "loss": -0.064, "num_tokens": 1577608.0, "reward": 0.2212499976158142, "reward_std": 0.29377394914627075, "rewards/reward_func/mean": 0.2212499976158142, "rewards/reward_func/std": 0.4595475196838379, "sampling/importance_sampling_ratio/max": 1.3747642040252686, "sampling/importance_sampling_ratio/mean": 0.9095951914787292, "sampling/importance_sampling_ratio/min": 0.45244070887565613, "sampling/sampling_logp_difference/max": 0.7049179077148438, "sampling/sampling_logp_difference/mean": 0.030841922387480736, "step": 283, "step_time": 84.87170067700208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.36814165115356445, "epoch": 0.568, "frac_reward_zero_std": 0.0, "grad_norm": 1.1691949367523193, "kl": 0.036575593054294586, "learning_rate": 4.206640123612885e-06, "loss": 0.0775, "num_tokens": 1582975.0, "reward": 0.7237499952316284, "reward_std": 0.5162468552589417, "rewards/reward_func/mean": 0.7237499952316284, "rewards/reward_func/std": 0.4783584177494049, "sampling/importance_sampling_ratio/max": 1.1772783994674683, "sampling/importance_sampling_ratio/mean": 0.7350926399230957, "sampling/importance_sampling_ratio/min": 0.24923691153526306, "sampling/sampling_logp_difference/max": 0.8092962503433228, "sampling/sampling_logp_difference/mean": 0.03558259829878807, "step": 284, "step_time": 59.08952103398042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 40.375, "completions/mean_terminated_length": 40.375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.31116950511932373, "epoch": 0.57, "frac_reward_zero_std": 0.0, "grad_norm": 1.1499625444412231, "kl": 0.05957644805312157, "learning_rate": 4.2007144641608035e-06, "loss": 0.2114, "num_tokens": 1588426.0, "reward": 0.20625001192092896, "reward_std": 0.29973241686820984, "rewards/reward_func/mean": 0.20625001192092896, "rewards/reward_func/std": 0.4607428014278412, "sampling/importance_sampling_ratio/max": 1.3515949249267578, "sampling/importance_sampling_ratio/mean": 0.9128226041793823, "sampling/importance_sampling_ratio/min": 0.24216710031032562, "sampling/sampling_logp_difference/max": 0.3609771728515625, "sampling/sampling_logp_difference/mean": 0.025414273142814636, "step": 285, "step_time": 69.37833641498582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 41.875, "completions/mean_terminated_length": 41.875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.40447139739990234, "epoch": 0.572, "frac_reward_zero_std": 0.0, "grad_norm": 2.3697779178619385, "kl": 0.02398294396698475, "learning_rate": 4.194770965018758e-06, "loss": 0.2327, "num_tokens": 1594255.0, "reward": 0.4737499952316284, "reward_std": 0.5166642665863037, "rewards/reward_func/mean": 0.4737499952316284, "rewards/reward_func/std": 0.5499594211578369, "sampling/importance_sampling_ratio/max": 1.7312536239624023, "sampling/importance_sampling_ratio/mean": 0.9678086638450623, "sampling/importance_sampling_ratio/min": 0.5032089352607727, "sampling/sampling_logp_difference/max": 0.43535709381103516, "sampling/sampling_logp_difference/mean": 0.030212290585041046, "step": 286, "step_time": 86.86279435199685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.35356995463371277, "epoch": 0.574, "frac_reward_zero_std": 0.0, "grad_norm": 1.1606370210647583, "kl": 0.023810427635908127, "learning_rate": 4.188809688531241e-06, "loss": -0.1423, "num_tokens": 1599919.0, "reward": 0.08499999344348907, "reward_std": 0.28996607661247253, "rewards/reward_func/mean": 0.08499999344348907, "rewards/reward_func/std": 0.3730185925960541, "sampling/importance_sampling_ratio/max": 1.213178277015686, "sampling/importance_sampling_ratio/mean": 0.8015030026435852, "sampling/importance_sampling_ratio/min": 0.18578791618347168, "sampling/sampling_logp_difference/max": 0.367124080657959, "sampling/sampling_logp_difference/mean": 0.02456764504313469, "step": 287, "step_time": 86.99813774897484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.37545299530029297, "epoch": 0.576, "frac_reward_zero_std": 0.0, "grad_norm": 0.9484260678291321, "kl": 0.03164836764335632, "learning_rate": 4.182830697229223e-06, "loss": 0.0409, "num_tokens": 1605747.0, "reward": 0.22625000774860382, "reward_std": 0.3063211739063263, "rewards/reward_func/mean": 0.22625000774860382, "rewards/reward_func/std": 0.4631241261959076, "sampling/importance_sampling_ratio/max": 1.3765383958816528, "sampling/importance_sampling_ratio/mean": 0.9212698340415955, "sampling/importance_sampling_ratio/min": 0.42979246377944946, "sampling/sampling_logp_difference/max": 0.46885204315185547, "sampling/sampling_logp_difference/mean": 0.0255972221493721, "step": 288, "step_time": 67.68871720400057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3726646602153778, "epoch": 0.578, "frac_reward_zero_std": 0.0, "grad_norm": 1.2278378009796143, "kl": 0.03291169926524162, "learning_rate": 4.176834053829492e-06, "loss": 0.0774, "num_tokens": 1611005.0, "reward": 0.33249998092651367, "reward_std": 0.5452134609222412, "rewards/reward_func/mean": 0.33249998092651367, "rewards/reward_func/std": 0.5284410715103149, "sampling/importance_sampling_ratio/max": 1.2047574520111084, "sampling/importance_sampling_ratio/mean": 0.9297256469726562, "sampling/importance_sampling_ratio/min": 0.5736981630325317, "sampling/sampling_logp_difference/max": 0.4195805788040161, "sampling/sampling_logp_difference/mean": 0.025271501392126083, "step": 289, "step_time": 57.347708321001846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.368205189704895, "epoch": 0.58, "frac_reward_zero_std": 0.0, "grad_norm": 1.06058931350708, "kl": 0.01727902702987194, "learning_rate": 4.170819821234001e-06, "loss": -0.1661, "num_tokens": 1616685.0, "reward": 0.6000000238418579, "reward_std": 0.5506213903427124, "rewards/reward_func/mean": 0.6000000238418579, "rewards/reward_func/std": 0.5333184599876404, "sampling/importance_sampling_ratio/max": 1.8462737798690796, "sampling/importance_sampling_ratio/mean": 0.8948688507080078, "sampling/importance_sampling_ratio/min": 0.4318339228630066, "sampling/sampling_logp_difference/max": 0.5361829996109009, "sampling/sampling_logp_difference/mean": 0.030598482117056847, "step": 290, "step_time": 63.116088277020026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 45.875, "completions/mean_terminated_length": 45.875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3375159502029419, "epoch": 0.582, "frac_reward_zero_std": 0.0, "grad_norm": 1.7271595001220703, "kl": 0.036022864282131195, "learning_rate": 4.164788062529203e-06, "loss": 0.3231, "num_tokens": 1622232.0, "reward": 0.3474999964237213, "reward_std": 0.2686045467853546, "rewards/reward_func/mean": 0.3474999964237213, "rewards/reward_func/std": 0.5296832323074341, "sampling/importance_sampling_ratio/max": 2.602046489715576, "sampling/importance_sampling_ratio/mean": 1.1395049095153809, "sampling/importance_sampling_ratio/min": 0.46707242727279663, "sampling/sampling_logp_difference/max": 0.3359344005584717, "sampling/sampling_logp_difference/mean": 0.025325238704681396, "step": 291, "step_time": 80.50347790899104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 45.375, "completions/mean_terminated_length": 45.375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.36788409948349, "epoch": 0.584, "frac_reward_zero_std": 0.0, "grad_norm": 1.0641146898269653, "kl": 0.027490928769111633, "learning_rate": 4.158738840985393e-06, "loss": 0.1119, "num_tokens": 1627699.0, "reward": 0.17999999225139618, "reward_std": 0.3215157687664032, "rewards/reward_func/mean": 0.17999999225139618, "rewards/reward_func/std": 0.4744320213794708, "sampling/importance_sampling_ratio/max": 1.5140283107757568, "sampling/importance_sampling_ratio/mean": 0.6938580870628357, "sampling/importance_sampling_ratio/min": 0.20368647575378418, "sampling/sampling_logp_difference/max": 0.8070402145385742, "sampling/sampling_logp_difference/mean": 0.02949894405901432, "step": 292, "step_time": 78.2215075980057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 45.125, "completions/mean_terminated_length": 45.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.4428096115589142, "epoch": 0.586, "frac_reward_zero_std": 0.0, "grad_norm": 1.749643325805664, "kl": 0.04018591344356537, "learning_rate": 4.1526722200560445e-06, "loss": -0.1564, "num_tokens": 1633177.0, "reward": 0.34375, "reward_std": 0.5563769340515137, "rewards/reward_func/mean": 0.34375, "rewards/reward_func/std": 0.5327540040016174, "sampling/importance_sampling_ratio/max": 2.5977683067321777, "sampling/importance_sampling_ratio/mean": 0.9267335534095764, "sampling/importance_sampling_ratio/min": 0.43353283405303955, "sampling/sampling_logp_difference/max": 0.6082069873809814, "sampling/sampling_logp_difference/mean": 0.036718130111694336, "step": 293, "step_time": 72.21064010998816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3257609009742737, "epoch": 0.588, "frac_reward_zero_std": 0.0, "grad_norm": 1.1807507276535034, "kl": 0.030308792367577553, "learning_rate": 4.146588263377137e-06, "loss": 0.0547, "num_tokens": 1638629.0, "reward": 0.5962499976158142, "reward_std": 0.5583738088607788, "rewards/reward_func/mean": 0.5962499976158142, "rewards/reward_func/std": 0.5385679006576538, "sampling/importance_sampling_ratio/max": 1.4324092864990234, "sampling/importance_sampling_ratio/mean": 0.9339421391487122, "sampling/importance_sampling_ratio/min": 0.6571045517921448, "sampling/sampling_logp_difference/max": 0.35495901107788086, "sampling/sampling_logp_difference/mean": 0.021137617528438568, "step": 294, "step_time": 68.63880399399204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.36071956157684326, "epoch": 0.59, "frac_reward_zero_std": 0.0, "grad_norm": 1.4050084352493286, "kl": 0.03706521913409233, "learning_rate": 4.140487034766499e-06, "loss": 0.0768, "num_tokens": 1644795.0, "reward": 0.35249999165534973, "reward_std": 0.5524863600730896, "rewards/reward_func/mean": 0.35249999165534973, "rewards/reward_func/std": 0.5344623327255249, "sampling/importance_sampling_ratio/max": 1.5176059007644653, "sampling/importance_sampling_ratio/mean": 0.9870838522911072, "sampling/importance_sampling_ratio/min": 0.6246324777603149, "sampling/sampling_logp_difference/max": 0.8912210464477539, "sampling/sampling_logp_difference/mean": 0.028878837823867798, "step": 295, "step_time": 57.42247140299878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3431392312049866, "epoch": 0.592, "frac_reward_zero_std": 0.0, "grad_norm": 1.1508780717849731, "kl": 0.03153597190976143, "learning_rate": 4.134368598223132e-06, "loss": 0.1312, "num_tokens": 1650107.0, "reward": 0.3387500047683716, "reward_std": 0.5518091320991516, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.5335979461669922, "sampling/importance_sampling_ratio/max": 1.6481400728225708, "sampling/importance_sampling_ratio/mean": 0.934108555316925, "sampling/importance_sampling_ratio/min": 0.31286314129829407, "sampling/sampling_logp_difference/max": 0.3653430938720703, "sampling/sampling_logp_difference/mean": 0.03261955454945564, "step": 296, "step_time": 48.63130289298715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 42.125, "completions/mean_terminated_length": 42.125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.33156031370162964, "epoch": 0.594, "frac_reward_zero_std": 0.0, "grad_norm": 1.4071409702301025, "kl": 0.041910890489816666, "learning_rate": 4.128233017926538e-06, "loss": 0.2664, "num_tokens": 1655436.0, "reward": -0.0637499988079071, "reward_std": 0.03450929373502731, "rewards/reward_func/mean": -0.0637499988079071, "rewards/reward_func/std": 0.04274091124534607, "sampling/importance_sampling_ratio/max": 1.8012773990631104, "sampling/importance_sampling_ratio/mean": 0.9478522539138794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.553492546081543, "sampling/sampling_logp_difference/mean": 0.030245978385210037, "step": 297, "step_time": 74.5723572280258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 43.875, "completions/mean_terminated_length": 43.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3426669239997864, "epoch": 0.596, "frac_reward_zero_std": 0.0, "grad_norm": 1.1105406284332275, "kl": 0.043890222907066345, "learning_rate": 4.1220803582360545e-06, "loss": -0.0848, "num_tokens": 1661032.0, "reward": -0.03999999910593033, "reward_std": 0.03082464262843132, "rewards/reward_func/mean": -0.03999999910593033, "rewards/reward_func/std": 0.029760954901576042, "sampling/importance_sampling_ratio/max": 1.2387455701828003, "sampling/importance_sampling_ratio/mean": 0.9630196690559387, "sampling/importance_sampling_ratio/min": 0.6839993596076965, "sampling/sampling_logp_difference/max": 0.5896548628807068, "sampling/sampling_logp_difference/mean": 0.029966674745082855, "step": 298, "step_time": 70.879077177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.36084866523742676, "epoch": 0.598, "frac_reward_zero_std": 0.0, "grad_norm": 1.1439580917358398, "kl": 0.04164861887693405, "learning_rate": 4.115910683690167e-06, "loss": 0.109, "num_tokens": 1666142.0, "reward": 0.46000000834465027, "reward_std": 0.5153526067733765, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.5593363046646118, "sampling/importance_sampling_ratio/max": 1.1726311445236206, "sampling/importance_sampling_ratio/mean": 0.7731176614761353, "sampling/importance_sampling_ratio/min": 0.44161850214004517, "sampling/sampling_logp_difference/max": 0.5879793167114258, "sampling/sampling_logp_difference/mean": 0.029971588402986526, "step": 299, "step_time": 40.97617705501034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.35997629165649414, "epoch": 0.6, "frac_reward_zero_std": 0.0, "grad_norm": 1.2015734910964966, "kl": 0.051748767495155334, "learning_rate": 4.109724059005844e-06, "loss": -0.1698, "num_tokens": 1671675.0, "reward": 0.19499999284744263, "reward_std": 0.5328658819198608, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.4941948652267456, "sampling/importance_sampling_ratio/max": 1.5676287412643433, "sampling/importance_sampling_ratio/mean": 0.8457791805267334, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.410290241241455, "sampling/sampling_logp_difference/mean": 0.03373635932803154, "step": 300, "step_time": 71.877353650023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3280709981918335, "epoch": 0.602, "frac_reward_zero_std": 0.0, "grad_norm": 1.5422002077102661, "kl": 0.04158281534910202, "learning_rate": 4.1035205490778505e-06, "loss": -0.1959, "num_tokens": 1677448.0, "reward": 0.3199999928474426, "reward_std": 0.5629400610923767, "rewards/reward_func/mean": 0.3199999928474426, "rewards/reward_func/std": 0.5402909517288208, "sampling/importance_sampling_ratio/max": 1.9516421556472778, "sampling/importance_sampling_ratio/mean": 1.1000713109970093, "sampling/importance_sampling_ratio/min": 0.3914698660373688, "sampling/sampling_logp_difference/max": 0.4937098026275635, "sampling/sampling_logp_difference/mean": 0.025912173092365265, "step": 301, "step_time": 59.86676025000634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3313376307487488, "epoch": 0.604, "frac_reward_zero_std": 0.0, "grad_norm": 0.997847855091095, "kl": 0.04069218039512634, "learning_rate": 4.09730021897807e-06, "loss": -0.0619, "num_tokens": 1683406.0, "reward": 0.19749999046325684, "reward_std": 0.3082555830478668, "rewards/reward_func/mean": 0.19749999046325684, "rewards/reward_func/std": 0.4607369899749756, "sampling/importance_sampling_ratio/max": 1.2228721380233765, "sampling/importance_sampling_ratio/mean": 0.8224000930786133, "sampling/importance_sampling_ratio/min": 0.42023351788520813, "sampling/sampling_logp_difference/max": 0.5434841513633728, "sampling/sampling_logp_difference/mean": 0.02660995163023472, "step": 302, "step_time": 57.52045150997583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.31421804428100586, "epoch": 0.606, "frac_reward_zero_std": 0.0, "grad_norm": 1.4279636144638062, "kl": 0.07083047926425934, "learning_rate": 4.091063133954821e-06, "loss": 0.2061, "num_tokens": 1689378.0, "reward": 0.19249999523162842, "reward_std": 0.5418117642402649, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.5016757845878601, "sampling/importance_sampling_ratio/max": 2.132955312728882, "sampling/importance_sampling_ratio/mean": 1.1564010381698608, "sampling/importance_sampling_ratio/min": 0.4834826588630676, "sampling/sampling_logp_difference/max": 0.5907609462738037, "sampling/sampling_logp_difference/mean": 0.03361092135310173, "step": 303, "step_time": 67.22909496401553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3458764851093292, "epoch": 0.608, "frac_reward_zero_std": 0.0, "grad_norm": 1.9183087348937988, "kl": 0.04568685591220856, "learning_rate": 4.084809359432175e-06, "loss": -0.0881, "num_tokens": 1694776.0, "reward": 0.09000000357627869, "reward_std": 0.2616836130619049, "rewards/reward_func/mean": 0.09000000357627869, "rewards/reward_func/std": 0.36847177147865295, "sampling/importance_sampling_ratio/max": 1.7175835371017456, "sampling/importance_sampling_ratio/mean": 0.9860607385635376, "sampling/importance_sampling_ratio/min": 0.327860951423645, "sampling/sampling_logp_difference/max": 0.5655612945556641, "sampling/sampling_logp_difference/mean": 0.028499091044068336, "step": 304, "step_time": 72.93971385998884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.331033319234848, "epoch": 0.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.8779069185256958, "kl": 0.04410824924707413, "learning_rate": 4.0785389610092684e-06, "loss": 0.0479, "num_tokens": 1700586.0, "reward": 0.59375, "reward_std": 0.2722131311893463, "rewards/reward_func/mean": 0.59375, "rewards/reward_func/std": 0.5475644469261169, "sampling/importance_sampling_ratio/max": 1.7427366971969604, "sampling/importance_sampling_ratio/mean": 1.1339176893234253, "sampling/importance_sampling_ratio/min": 0.7268555760383606, "sampling/sampling_logp_difference/max": 0.3699074983596802, "sampling/sampling_logp_difference/mean": 0.02840990573167801, "step": 305, "step_time": 38.05158003201359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3475850224494934, "epoch": 0.612, "frac_reward_zero_std": 0.0, "grad_norm": 1.3732694387435913, "kl": 0.05851783603429794, "learning_rate": 4.072252004459612e-06, "loss": -0.4087, "num_tokens": 1706255.0, "reward": 0.4462500214576721, "reward_std": 0.5143392086029053, "rewards/reward_func/mean": 0.4462500214576721, "rewards/reward_func/std": 0.5704118609428406, "sampling/importance_sampling_ratio/max": 2.909179925918579, "sampling/importance_sampling_ratio/mean": 1.355375051498413, "sampling/importance_sampling_ratio/min": 0.4884859621524811, "sampling/sampling_logp_difference/max": 0.7288825511932373, "sampling/sampling_logp_difference/mean": 0.031946711242198944, "step": 306, "step_time": 56.35153491600067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 44.375, "completions/mean_terminated_length": 44.375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.37247800827026367, "epoch": 0.614, "frac_reward_zero_std": 0.0, "grad_norm": 1.049514889717102, "kl": 0.05375540256500244, "learning_rate": 4.065948555730405e-06, "loss": 0.1078, "num_tokens": 1712211.0, "reward": 0.45625001192092896, "reward_std": 0.6163418889045715, "rewards/reward_func/mean": 0.45625001192092896, "rewards/reward_func/std": 0.5708875060081482, "sampling/importance_sampling_ratio/max": 1.4518539905548096, "sampling/importance_sampling_ratio/mean": 0.7474272847175598, "sampling/importance_sampling_ratio/min": 0.3841031789779663, "sampling/sampling_logp_difference/max": 0.5305154323577881, "sampling/sampling_logp_difference/mean": 0.030646320432424545, "step": 307, "step_time": 66.77364812800079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3100343346595764, "epoch": 0.616, "frac_reward_zero_std": 0.0, "grad_norm": 1.0289682149887085, "kl": 0.052720747888088226, "learning_rate": 4.059628680941843e-06, "loss": 0.0498, "num_tokens": 1717818.0, "reward": 0.21125000715255737, "reward_std": 0.29770827293395996, "rewards/reward_func/mean": 0.21125000715255737, "rewards/reward_func/std": 0.4665508270263672, "sampling/importance_sampling_ratio/max": 1.2022721767425537, "sampling/importance_sampling_ratio/mean": 0.9475799202919006, "sampling/importance_sampling_ratio/min": 0.5206012725830078, "sampling/sampling_logp_difference/max": 0.5640921592712402, "sampling/sampling_logp_difference/mean": 0.03175481781363487, "step": 308, "step_time": 52.89879798798938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 44.625, "completions/mean_terminated_length": 44.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.392910361289978, "epoch": 0.618, "frac_reward_zero_std": 0.0, "grad_norm": 1.1463991403579712, "kl": 0.09175321459770203, "learning_rate": 4.053292446386422e-06, "loss": 0.103, "num_tokens": 1722948.0, "reward": 0.32750001549720764, "reward_std": 0.5502669811248779, "rewards/reward_func/mean": 0.32750001549720764, "rewards/reward_func/std": 0.5303031802177429, "sampling/importance_sampling_ratio/max": 1.6144081354141235, "sampling/importance_sampling_ratio/mean": 0.8773033022880554, "sampling/importance_sampling_ratio/min": 0.24481238424777985, "sampling/sampling_logp_difference/max": 0.6958191394805908, "sampling/sampling_logp_difference/mean": 0.03132324665784836, "step": 309, "step_time": 62.81438364399946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 43.375, "completions/mean_terminated_length": 43.375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3314189910888672, "epoch": 0.62, "frac_reward_zero_std": 0.0, "grad_norm": 1.1365076303482056, "kl": 0.045356642454862595, "learning_rate": 4.046939918528243e-06, "loss": -0.0211, "num_tokens": 1728875.0, "reward": -0.04874999821186066, "reward_std": 0.03686491772532463, "rewards/reward_func/mean": -0.04874999821186066, "rewards/reward_func/std": 0.03482097014784813, "sampling/importance_sampling_ratio/max": 1.382016897201538, "sampling/importance_sampling_ratio/mean": 0.8258918523788452, "sampling/importance_sampling_ratio/min": 0.43768084049224854, "sampling/sampling_logp_difference/max": 0.34904003143310547, "sampling/sampling_logp_difference/mean": 0.02661317214369774, "step": 310, "step_time": 84.43736876899493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.342917263507843, "epoch": 0.622, "frac_reward_zero_std": 0.0, "grad_norm": 1.4713886976242065, "kl": 0.060660719871520996, "learning_rate": 4.040571164002319e-06, "loss": 0.0434, "num_tokens": 1734842.0, "reward": 0.3512499928474426, "reward_std": 0.5479995012283325, "rewards/reward_func/mean": 0.3512499928474426, "rewards/reward_func/std": 0.5240620970726013, "sampling/importance_sampling_ratio/max": 1.7727338075637817, "sampling/importance_sampling_ratio/mean": 0.9686833024024963, "sampling/importance_sampling_ratio/min": 0.39146628975868225, "sampling/sampling_logp_difference/max": 0.700446605682373, "sampling/sampling_logp_difference/mean": 0.029514621943235397, "step": 311, "step_time": 68.79045250298805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3210405707359314, "epoch": 0.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.8161787390708923, "kl": 0.04533413052558899, "learning_rate": 4.034186249613869e-06, "loss": 0.137, "num_tokens": 1740368.0, "reward": 0.0637500062584877, "reward_std": 0.2749331593513489, "rewards/reward_func/mean": 0.0637500062584877, "rewards/reward_func/std": 0.3796215355396271, "sampling/importance_sampling_ratio/max": 1.1369949579238892, "sampling/importance_sampling_ratio/mean": 0.693924069404602, "sampling/importance_sampling_ratio/min": 0.3688696622848511, "sampling/sampling_logp_difference/max": 0.5726242065429688, "sampling/sampling_logp_difference/mean": 0.02921966463327408, "step": 312, "step_time": 74.01787159900414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 40.625, "completions/mean_terminated_length": 40.625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.3051733076572418, "epoch": 0.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.9667705297470093, "kl": 0.04298641160130501, "learning_rate": 4.027785242337626e-06, "loss": 0.0425, "num_tokens": 1745737.0, "reward": 0.4637500047683716, "reward_std": 0.6106890439987183, "rewards/reward_func/mean": 0.4637500047683716, "rewards/reward_func/std": 0.5657343864440918, "sampling/importance_sampling_ratio/max": 1.2044254541397095, "sampling/importance_sampling_ratio/mean": 0.7017180919647217, "sampling/importance_sampling_ratio/min": 0.41121870279312134, "sampling/sampling_logp_difference/max": 0.5601418018341064, "sampling/sampling_logp_difference/mean": 0.03372935950756073, "step": 313, "step_time": 56.49958878697362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.37881386280059814, "epoch": 0.628, "frac_reward_zero_std": 0.0, "grad_norm": 0.9144023060798645, "kl": 0.03883805125951767, "learning_rate": 4.021368209317126e-06, "loss": 0.0706, "num_tokens": 1750627.0, "reward": 0.3125, "reward_std": 0.5669680833816528, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.5413936972618103, "sampling/importance_sampling_ratio/max": 1.3021681308746338, "sampling/importance_sampling_ratio/mean": 0.6122154593467712, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7377749681472778, "sampling/sampling_logp_difference/mean": 0.03731653094291687, "step": 314, "step_time": 64.65534779199515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 44.375, "completions/mean_terminated_length": 44.375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.322407066822052, "epoch": 0.63, "frac_reward_zero_std": 0.0, "grad_norm": 1.6095478534698486, "kl": 0.06635308265686035, "learning_rate": 4.014935217864009e-06, "loss": 0.0829, "num_tokens": 1756143.0, "reward": 0.3400000333786011, "reward_std": 0.5607088804244995, "rewards/reward_func/mean": 0.3400000333786011, "rewards/reward_func/std": 0.5389142632484436, "sampling/importance_sampling_ratio/max": 2.9857375621795654, "sampling/importance_sampling_ratio/mean": 1.097962737083435, "sampling/importance_sampling_ratio/min": 0.29625648260116577, "sampling/sampling_logp_difference/max": 0.9051809310913086, "sampling/sampling_logp_difference/mean": 0.030315592885017395, "step": 315, "step_time": 75.3316736620036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.354464590549469, "epoch": 0.632, "frac_reward_zero_std": 0.0, "grad_norm": 1.7500587701797485, "kl": 0.041008904576301575, "learning_rate": 4.008486335457312e-06, "loss": 0.2378, "num_tokens": 1761628.0, "reward": 0.0949999988079071, "reward_std": 0.28850340843200684, "rewards/reward_func/mean": 0.0949999988079071, "rewards/reward_func/std": 0.3677732050418854, "sampling/importance_sampling_ratio/max": 1.8479022979736328, "sampling/importance_sampling_ratio/mean": 0.8248315453529358, "sampling/importance_sampling_ratio/min": 0.3391701281070709, "sampling/sampling_logp_difference/max": 0.9850505590438843, "sampling/sampling_logp_difference/mean": 0.026787061244249344, "step": 316, "step_time": 72.44107799098128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 46.625, "completions/mean_terminated_length": 46.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.37994247674942017, "epoch": 0.634, "frac_reward_zero_std": 0.0, "grad_norm": 1.623810052871704, "kl": 0.03418804332613945, "learning_rate": 4.002021629742759e-06, "loss": -0.0948, "num_tokens": 1767506.0, "reward": 0.07000000029802322, "reward_std": 0.2821284532546997, "rewards/reward_func/mean": 0.07000000029802322, "rewards/reward_func/std": 0.37405118346214294, "sampling/importance_sampling_ratio/max": 2.5057566165924072, "sampling/importance_sampling_ratio/mean": 1.1749823093414307, "sampling/importance_sampling_ratio/min": 0.4858405590057373, "sampling/sampling_logp_difference/max": 0.3556022644042969, "sampling/sampling_logp_difference/mean": 0.03094809502363205, "step": 317, "step_time": 73.08309103900683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 43.125, "completions/mean_terminated_length": 43.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3214326500892639, "epoch": 0.636, "frac_reward_zero_std": 0.0, "grad_norm": 2.192352294921875, "kl": 0.05013597011566162, "learning_rate": 3.995541168532055e-06, "loss": 0.1681, "num_tokens": 1772800.0, "reward": 0.21500001847743988, "reward_std": 0.30573850870132446, "rewards/reward_func/mean": 0.21500001847743988, "rewards/reward_func/std": 0.47563493251800537, "sampling/importance_sampling_ratio/max": 2.769392967224121, "sampling/importance_sampling_ratio/mean": 1.9062137603759766, "sampling/importance_sampling_ratio/min": 0.9690021276473999, "sampling/sampling_logp_difference/max": 0.4255542755126953, "sampling/sampling_logp_difference/mean": 0.029431238770484924, "step": 318, "step_time": 78.88887128600618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 46.125, "completions/mean_terminated_length": 46.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3768948018550873, "epoch": 0.638, "frac_reward_zero_std": 0.0, "grad_norm": 1.7573028802871704, "kl": 0.041640929877758026, "learning_rate": 3.989045019802171e-06, "loss": -0.0145, "num_tokens": 1778980.0, "reward": 0.19999998807907104, "reward_std": 0.31466037034988403, "rewards/reward_func/mean": 0.19999998807907104, "rewards/reward_func/std": 0.46757736802101135, "sampling/importance_sampling_ratio/max": 2.0911169052124023, "sampling/importance_sampling_ratio/mean": 1.2725701332092285, "sampling/importance_sampling_ratio/min": 0.7229686379432678, "sampling/sampling_logp_difference/max": 0.35713261365890503, "sampling/sampling_logp_difference/mean": 0.030360868200659752, "step": 319, "step_time": 77.96896261701477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.36589163541793823, "epoch": 0.64, "frac_reward_zero_std": 0.0, "grad_norm": 1.2818424701690674, "kl": 0.03474745154380798, "learning_rate": 3.982533251694632e-06, "loss": -0.3233, "num_tokens": 1785246.0, "reward": 0.21000000834465027, "reward_std": 0.308533251285553, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.4801190495491028, "sampling/importance_sampling_ratio/max": 1.9405113458633423, "sampling/importance_sampling_ratio/mean": 1.1001548767089844, "sampling/importance_sampling_ratio/min": 0.4418053925037384, "sampling/sampling_logp_difference/max": 0.6586148738861084, "sampling/sampling_logp_difference/mean": 0.02688867226243019, "step": 320, "step_time": 76.73345412599156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.29908668994903564, "epoch": 0.642, "frac_reward_zero_std": 0.0, "grad_norm": 1.0722907781600952, "kl": 0.05384838581085205, "learning_rate": 3.976005932514807e-06, "loss": -0.0787, "num_tokens": 1790214.0, "reward": 0.45875000953674316, "reward_std": 0.5092880129814148, "rewards/reward_func/mean": 0.45875000953674316, "rewards/reward_func/std": 0.5402231812477112, "sampling/importance_sampling_ratio/max": 2.0527751445770264, "sampling/importance_sampling_ratio/mean": 1.0192339420318604, "sampling/importance_sampling_ratio/min": 0.3669769763946533, "sampling/sampling_logp_difference/max": 0.6636786460876465, "sampling/sampling_logp_difference/mean": 0.02793467789888382, "step": 321, "step_time": 52.20804076900822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.875, "completions/mean_terminated_length": 49.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.35425591468811035, "epoch": 0.644, "frac_reward_zero_std": 0.0, "grad_norm": 1.0075650215148926, "kl": 0.07112079858779907, "learning_rate": 3.969463130731183e-06, "loss": -0.2281, "num_tokens": 1796411.0, "reward": 0.48625001311302185, "reward_std": 0.5877156257629395, "rewards/reward_func/mean": 0.48625001311302185, "rewards/reward_func/std": 0.5441622138023376, "sampling/importance_sampling_ratio/max": 2.1321513652801514, "sampling/importance_sampling_ratio/mean": 0.815255343914032, "sampling/importance_sampling_ratio/min": 0.3492589294910431, "sampling/sampling_logp_difference/max": 0.6331918239593506, "sampling/sampling_logp_difference/mean": 0.02603982575237751, "step": 322, "step_time": 75.52292313199723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 44.125, "completions/mean_terminated_length": 44.125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.31836074590682983, "epoch": 0.646, "frac_reward_zero_std": 0.0, "grad_norm": 1.4238131046295166, "kl": 0.046384476125240326, "learning_rate": 3.962904914974656e-06, "loss": 0.0372, "num_tokens": 1801901.0, "reward": 0.35374999046325684, "reward_std": 0.5490298271179199, "rewards/reward_func/mean": 0.35374999046325684, "rewards/reward_func/std": 0.5245934128761292, "sampling/importance_sampling_ratio/max": 1.4118305444717407, "sampling/importance_sampling_ratio/mean": 0.8808070421218872, "sampling/importance_sampling_ratio/min": 0.5332664847373962, "sampling/sampling_logp_difference/max": 0.8431804180145264, "sampling/sampling_logp_difference/mean": 0.028243277221918106, "step": 323, "step_time": 68.1251233840012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.31607919931411743, "epoch": 0.648, "frac_reward_zero_std": 0.0, "grad_norm": 1.3807119131088257, "kl": 0.07309075444936752, "learning_rate": 3.956331354037805e-06, "loss": -0.0479, "num_tokens": 1806905.0, "reward": 0.21124999225139618, "reward_std": 0.3054782450199127, "rewards/reward_func/mean": 0.21124999225139618, "rewards/reward_func/std": 0.4696028232574463, "sampling/importance_sampling_ratio/max": 1.787608027458191, "sampling/importance_sampling_ratio/mean": 1.0451674461364746, "sampling/importance_sampling_ratio/min": 0.46881166100502014, "sampling/sampling_logp_difference/max": 0.5253305435180664, "sampling/sampling_logp_difference/mean": 0.029314052313566208, "step": 324, "step_time": 59.86747224899591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3600061535835266, "epoch": 0.65, "frac_reward_zero_std": 0.0, "grad_norm": 1.817853569984436, "kl": 0.09967118501663208, "learning_rate": 3.949742516874175e-06, "loss": 0.2608, "num_tokens": 1812735.0, "reward": 0.20499999821186066, "reward_std": 0.3137704133987427, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.47461265325546265, "sampling/importance_sampling_ratio/max": 2.5692901611328125, "sampling/importance_sampling_ratio/mean": 0.9470885992050171, "sampling/importance_sampling_ratio/min": 0.3106057047843933, "sampling/sampling_logp_difference/max": 0.8942482471466064, "sampling/sampling_logp_difference/mean": 0.03815475106239319, "step": 325, "step_time": 61.724708480003756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 41.125, "completions/mean_terminated_length": 41.125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3302639126777649, "epoch": 0.652, "frac_reward_zero_std": 0.0, "grad_norm": 1.5889121294021606, "kl": 0.032158948481082916, "learning_rate": 3.943138472597549e-06, "loss": 0.0395, "num_tokens": 1817852.0, "reward": 0.08499999344348907, "reward_std": 0.28810441493988037, "rewards/reward_func/mean": 0.08499999344348907, "rewards/reward_func/std": 0.37232860922813416, "sampling/importance_sampling_ratio/max": 2.248680830001831, "sampling/importance_sampling_ratio/mean": 1.1071228981018066, "sampling/importance_sampling_ratio/min": 0.4287269413471222, "sampling/sampling_logp_difference/max": 0.4500095844268799, "sampling/sampling_logp_difference/mean": 0.03271816670894623, "step": 326, "step_time": 78.85871165001299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 41.875, "completions/mean_terminated_length": 41.875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.3419029712677002, "epoch": 0.654, "frac_reward_zero_std": 0.0, "grad_norm": 2.6311590671539307, "kl": 0.047403544187545776, "learning_rate": 3.936519290481226e-06, "loss": -0.2247, "num_tokens": 1823582.0, "reward": 0.19874998927116394, "reward_std": 0.518446683883667, "rewards/reward_func/mean": 0.19874998927116394, "rewards/reward_func/std": 0.48034030199050903, "sampling/importance_sampling_ratio/max": 1.9648873805999756, "sampling/importance_sampling_ratio/mean": 1.09955632686615, "sampling/importance_sampling_ratio/min": 0.5106386542320251, "sampling/sampling_logp_difference/max": 0.47838133573532104, "sampling/sampling_logp_difference/mean": 0.03344731032848358, "step": 327, "step_time": 74.20873203998781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.31387221813201904, "epoch": 0.656, "frac_reward_zero_std": 0.0, "grad_norm": 1.6050386428833008, "kl": 0.0334724560379982, "learning_rate": 3.929885039957296e-06, "loss": 0.1015, "num_tokens": 1828698.0, "reward": 0.1899999976158142, "reward_std": 0.33716854453086853, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.4895770847797394, "sampling/importance_sampling_ratio/max": 1.870469570159912, "sampling/importance_sampling_ratio/mean": 0.8102731108665466, "sampling/importance_sampling_ratio/min": 0.3841648995876312, "sampling/sampling_logp_difference/max": 0.6972520351409912, "sampling/sampling_logp_difference/mean": 0.02928170934319496, "step": 328, "step_time": 132.74387937001302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 50.625, "completions/mean_terminated_length": 50.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.33447951078414917, "epoch": 0.658, "frac_reward_zero_std": 0.0, "grad_norm": 1.379056453704834, "kl": 0.03984800726175308, "learning_rate": 3.923235790615907e-06, "loss": -0.1686, "num_tokens": 1834063.0, "reward": 0.21125000715255737, "reward_std": 0.506218671798706, "rewards/reward_func/mean": 0.21125000715255737, "rewards/reward_func/std": 0.46896353363990784, "sampling/importance_sampling_ratio/max": 1.307706594467163, "sampling/importance_sampling_ratio/mean": 0.8228154182434082, "sampling/importance_sampling_ratio/min": 0.5563095211982727, "sampling/sampling_logp_difference/max": 0.5035196542739868, "sampling/sampling_logp_difference/mean": 0.025627177208662033, "step": 329, "step_time": 143.56957943798625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 39.125, "completions/mean_terminated_length": 39.125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.3598230183124542, "epoch": 0.66, "frac_reward_zero_std": 0.0, "grad_norm": 1.37947416305542, "kl": 0.0858326181769371, "learning_rate": 3.916571612204538e-06, "loss": -0.1299, "num_tokens": 1839339.0, "reward": 0.21250000596046448, "reward_std": 0.5239397287368774, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.4850846827030182, "sampling/importance_sampling_ratio/max": 1.8522554636001587, "sampling/importance_sampling_ratio/mean": 1.0172133445739746, "sampling/importance_sampling_ratio/min": 0.35823509097099304, "sampling/sampling_logp_difference/max": 0.7279484272003174, "sampling/sampling_logp_difference/mean": 0.03425194323062897, "step": 330, "step_time": 138.2842517439858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3195509910583496, "epoch": 0.662, "frac_reward_zero_std": 0.0, "grad_norm": 2.0245273113250732, "kl": 0.04566916078329086, "learning_rate": 3.909892574627267e-06, "loss": -0.0532, "num_tokens": 1845149.0, "reward": 0.3187499940395355, "reward_std": 0.5860832333564758, "rewards/reward_func/mean": 0.3187499940395355, "rewards/reward_func/std": 0.56430584192276, "sampling/importance_sampling_ratio/max": 2.2832202911376953, "sampling/importance_sampling_ratio/mean": 1.1496918201446533, "sampling/importance_sampling_ratio/min": 0.5304498672485352, "sampling/sampling_logp_difference/max": 0.8081755638122559, "sampling/sampling_logp_difference/mean": 0.028967654332518578, "step": 331, "step_time": 137.66897978598718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.32289984822273254, "epoch": 0.664, "frac_reward_zero_std": 0.0, "grad_norm": 0.8185482025146484, "kl": 0.10041403025388718, "learning_rate": 3.903198747944037e-06, "loss": 0.1168, "num_tokens": 1850899.0, "reward": 0.22624999284744263, "reward_std": 0.3100869655609131, "rewards/reward_func/mean": 0.22624999284744263, "rewards/reward_func/std": 0.47853758931159973, "sampling/importance_sampling_ratio/max": 1.338444471359253, "sampling/importance_sampling_ratio/mean": 0.8065738677978516, "sampling/importance_sampling_ratio/min": 0.3798188865184784, "sampling/sampling_logp_difference/max": 0.8146078586578369, "sampling/sampling_logp_difference/mean": 0.02689986675977707, "step": 332, "step_time": 131.41008436502307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 46.375, "completions/mean_terminated_length": 46.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3707420527935028, "epoch": 0.666, "frac_reward_zero_std": 0.0, "grad_norm": 1.2363697290420532, "kl": 0.03508942574262619, "learning_rate": 3.896490202369924e-06, "loss": 0.0616, "num_tokens": 1856034.0, "reward": 0.32124999165534973, "reward_std": 0.5887609720230103, "rewards/reward_func/mean": 0.32124999165534973, "rewards/reward_func/std": 0.563393771648407, "sampling/importance_sampling_ratio/max": 1.6524691581726074, "sampling/importance_sampling_ratio/mean": 0.8412412405014038, "sampling/importance_sampling_ratio/min": 0.23496931791305542, "sampling/sampling_logp_difference/max": 0.6445038318634033, "sampling/sampling_logp_difference/mean": 0.03339887410402298, "step": 333, "step_time": 111.69739279698115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.33553531765937805, "epoch": 0.668, "frac_reward_zero_std": 0.0, "grad_norm": 0.9219703674316406, "kl": 0.09176594018936157, "learning_rate": 3.889767008274396e-06, "loss": 0.2604, "num_tokens": 1861621.0, "reward": 0.3537500202655792, "reward_std": 0.5339703559875488, "rewards/reward_func/mean": 0.3537500202655792, "rewards/reward_func/std": 0.5136407017707825, "sampling/importance_sampling_ratio/max": 1.8493578433990479, "sampling/importance_sampling_ratio/mean": 1.008284091949463, "sampling/importance_sampling_ratio/min": 0.41284075379371643, "sampling/sampling_logp_difference/max": 0.6564333438873291, "sampling/sampling_logp_difference/mean": 0.025348259136080742, "step": 334, "step_time": 143.32291307201376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 39.625, "completions/mean_terminated_length": 39.625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.3513152301311493, "epoch": 0.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.9763464331626892, "kl": 0.06704329699277878, "learning_rate": 3.883029236180577e-06, "loss": -0.1172, "num_tokens": 1867778.0, "reward": 0.35374999046325684, "reward_std": 0.2704784870147705, "rewards/reward_func/mean": 0.35374999046325684, "rewards/reward_func/std": 0.5355620980262756, "sampling/importance_sampling_ratio/max": 1.107627272605896, "sampling/importance_sampling_ratio/mean": 0.7351757287979126, "sampling/importance_sampling_ratio/min": 0.19063004851341248, "sampling/sampling_logp_difference/max": 0.8456048965454102, "sampling/sampling_logp_difference/mean": 0.033929385244846344, "step": 335, "step_time": 103.19867493197671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 41.875, "completions/mean_terminated_length": 41.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.31681621074676514, "epoch": 0.672, "frac_reward_zero_std": 0.0, "grad_norm": 1.7861030101776123, "kl": 0.03690392151474953, "learning_rate": 3.876276956764509e-06, "loss": 0.2589, "num_tokens": 1872931.0, "reward": 0.21375000476837158, "reward_std": 0.32227829098701477, "rewards/reward_func/mean": 0.21375000476837158, "rewards/reward_func/std": 0.4870299994945526, "sampling/importance_sampling_ratio/max": 2.2250986099243164, "sampling/importance_sampling_ratio/mean": 1.0775128602981567, "sampling/importance_sampling_ratio/min": 0.5108433961868286, "sampling/sampling_logp_difference/max": 0.35452377796173096, "sampling/sampling_logp_difference/mean": 0.026147497817873955, "step": 336, "step_time": 108.95734459199593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.3189052939414978, "epoch": 0.674, "frac_reward_zero_std": 0.0, "grad_norm": 1.9657090902328491, "kl": 0.06692003458738327, "learning_rate": 3.869510240854408e-06, "loss": 0.1239, "num_tokens": 1878410.0, "reward": 0.32249999046325684, "reward_std": 0.5750815868377686, "rewards/reward_func/mean": 0.32249999046325684, "rewards/reward_func/std": 0.5570265650749207, "sampling/importance_sampling_ratio/max": 2.461669445037842, "sampling/importance_sampling_ratio/mean": 1.2536345720291138, "sampling/importance_sampling_ratio/min": 0.7026631236076355, "sampling/sampling_logp_difference/max": 0.595012903213501, "sampling/sampling_logp_difference/mean": 0.025707338005304337, "step": 337, "step_time": 101.68271847401047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3161134421825409, "epoch": 0.676, "frac_reward_zero_std": 0.0, "grad_norm": 1.2771250009536743, "kl": 0.017098795622587204, "learning_rate": 3.862729159429921e-06, "loss": -0.2422, "num_tokens": 1883892.0, "reward": 0.7250000238418579, "reward_std": 0.49705883860588074, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.4603104889392853, "sampling/importance_sampling_ratio/max": 2.6572251319885254, "sampling/importance_sampling_ratio/mean": 1.1089489459991455, "sampling/importance_sampling_ratio/min": 0.5457375645637512, "sampling/sampling_logp_difference/max": 0.491180419921875, "sampling/sampling_logp_difference/mean": 0.02168424054980278, "step": 338, "step_time": 114.0666631339991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.396328866481781, "epoch": 0.678, "frac_reward_zero_std": 0.0, "grad_norm": 1.4366711378097534, "kl": 0.17721709609031677, "learning_rate": 3.855933783621384e-06, "loss": 0.1282, "num_tokens": 1889200.0, "reward": 0.30000001192092896, "reward_std": 0.30565518140792847, "rewards/reward_func/mean": 0.30000001192092896, "rewards/reward_func/std": 0.5595406293869019, "sampling/importance_sampling_ratio/max": 1.7435904741287231, "sampling/importance_sampling_ratio/mean": 0.9623221158981323, "sampling/importance_sampling_ratio/min": 0.3996666669845581, "sampling/sampling_logp_difference/max": 0.7043921947479248, "sampling/sampling_logp_difference/mean": 0.03383718058466911, "step": 339, "step_time": 126.46415012000944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3627287745475769, "epoch": 0.68, "frac_reward_zero_std": 0.0, "grad_norm": 1.1164697408676147, "kl": 0.04694604501128197, "learning_rate": 3.849124184709073e-06, "loss": 0.0417, "num_tokens": 1894511.0, "reward": 0.06749999523162842, "reward_std": 0.2736448645591736, "rewards/reward_func/mean": 0.06749999523162842, "rewards/reward_func/std": 0.3591557443141937, "sampling/importance_sampling_ratio/max": 1.6633166074752808, "sampling/importance_sampling_ratio/mean": 1.0712683200836182, "sampling/importance_sampling_ratio/min": 0.5509455800056458, "sampling/sampling_logp_difference/max": 0.3140767812728882, "sampling/sampling_logp_difference/mean": 0.02482220157980919, "step": 340, "step_time": 162.8545896350115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.31078046560287476, "epoch": 0.682, "frac_reward_zero_std": 0.0, "grad_norm": 2.7845726013183594, "kl": 0.04270056635141373, "learning_rate": 3.84230043412246e-06, "loss": -0.2732, "num_tokens": 1900006.0, "reward": 0.32625001668930054, "reward_std": 0.5653538703918457, "rewards/reward_func/mean": 0.32625001668930054, "rewards/reward_func/std": 0.5494916439056396, "sampling/importance_sampling_ratio/max": 2.5016562938690186, "sampling/importance_sampling_ratio/mean": 1.0579283237457275, "sampling/importance_sampling_ratio/min": 0.6514557003974915, "sampling/sampling_logp_difference/max": 0.5734856128692627, "sampling/sampling_logp_difference/mean": 0.029540089890360832, "step": 341, "step_time": 155.76240094099194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 43.125, "completions/mean_terminated_length": 43.125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.3381982445716858, "epoch": 0.684, "frac_reward_zero_std": 0.0, "grad_norm": 1.8940988779067993, "kl": 0.1022232174873352, "learning_rate": 3.835462603439458e-06, "loss": 0.1884, "num_tokens": 1904962.0, "reward": 0.21625001728534698, "reward_std": 0.31474921107292175, "rewards/reward_func/mean": 0.21625001728534698, "rewards/reward_func/std": 0.4785973131656647, "sampling/importance_sampling_ratio/max": 1.3884916305541992, "sampling/importance_sampling_ratio/mean": 1.0323078632354736, "sampling/importance_sampling_ratio/min": 0.5798347592353821, "sampling/sampling_logp_difference/max": 0.6027919054031372, "sampling/sampling_logp_difference/mean": 0.026107758283615112, "step": 342, "step_time": 166.86433797102654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 40.625, "completions/mean_terminated_length": 40.625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.33497440814971924, "epoch": 0.686, "frac_reward_zero_std": 0.0, "grad_norm": 2.1610031127929688, "kl": 0.09758682548999786, "learning_rate": 3.828610764385676e-06, "loss": -0.0412, "num_tokens": 1911022.0, "reward": -0.07250000536441803, "reward_std": 0.054318200796842575, "rewards/reward_func/mean": -0.07250000536441803, "rewards/reward_func/std": 0.054967526346445084, "sampling/importance_sampling_ratio/max": 1.8711848258972168, "sampling/importance_sampling_ratio/mean": 1.0269144773483276, "sampling/importance_sampling_ratio/min": 0.13635054230690002, "sampling/sampling_logp_difference/max": 1.1250584125518799, "sampling/sampling_logp_difference/mean": 0.0336228646337986, "step": 343, "step_time": 180.27627392599243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.2781206965446472, "epoch": 0.688, "frac_reward_zero_std": 0.0, "grad_norm": 1.1632080078125, "kl": 0.10593483597040176, "learning_rate": 3.821744988833664e-06, "loss": 0.0054, "num_tokens": 1916625.0, "reward": 0.3199999928474426, "reward_std": 0.5272395610809326, "rewards/reward_func/mean": 0.3199999928474426, "rewards/reward_func/std": 0.5035871863365173, "sampling/importance_sampling_ratio/max": 1.4332565069198608, "sampling/importance_sampling_ratio/mean": 0.9319165945053101, "sampling/importance_sampling_ratio/min": 0.38523051142692566, "sampling/sampling_logp_difference/max": 0.8005368709564209, "sampling/sampling_logp_difference/mean": 0.020923875272274017, "step": 344, "step_time": 169.41366141600884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 39.375, "completions/mean_terminated_length": 39.375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.2810716927051544, "epoch": 0.69, "frac_reward_zero_std": 0.0, "grad_norm": 1.4414160251617432, "kl": 0.06682641804218292, "learning_rate": 3.814865348802157e-06, "loss": -0.2297, "num_tokens": 1921399.0, "reward": 0.21000000834465027, "reward_std": 0.5288327932357788, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.48986876010894775, "sampling/importance_sampling_ratio/max": 2.1541748046875, "sampling/importance_sampling_ratio/mean": 1.0254625082015991, "sampling/importance_sampling_ratio/min": 0.4324725568294525, "sampling/sampling_logp_difference/max": 0.8203954696655273, "sampling/sampling_logp_difference/mean": 0.026356138288974762, "step": 345, "step_time": 129.16487764098565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.34982192516326904, "epoch": 0.692, "frac_reward_zero_std": 0.0, "grad_norm": 1.2038586139678955, "kl": 0.16085557639598846, "learning_rate": 3.807971916455325e-06, "loss": 0.0815, "num_tokens": 1926202.0, "reward": 0.07124999910593033, "reward_std": 0.27993497252464294, "rewards/reward_func/mean": 0.07124999910593033, "rewards/reward_func/std": 0.3617590665817261, "sampling/importance_sampling_ratio/max": 1.7573891878128052, "sampling/importance_sampling_ratio/mean": 0.9032962322235107, "sampling/importance_sampling_ratio/min": 0.29005834460258484, "sampling/sampling_logp_difference/max": 1.3162736892700195, "sampling/sampling_logp_difference/mean": 0.03286924958229065, "step": 346, "step_time": 146.0007931359869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3232361674308777, "epoch": 0.694, "frac_reward_zero_std": 0.0, "grad_norm": 1.0941061973571777, "kl": 0.026414429768919945, "learning_rate": 3.8010647641020116e-06, "loss": 0.1266, "num_tokens": 1931733.0, "reward": 0.5987499952316284, "reward_std": 0.5347613096237183, "rewards/reward_func/mean": 0.5987499952316284, "rewards/reward_func/std": 0.5155701041221619, "sampling/importance_sampling_ratio/max": 1.3389661312103271, "sampling/importance_sampling_ratio/mean": 0.6809048056602478, "sampling/importance_sampling_ratio/min": 0.19245320558547974, "sampling/sampling_logp_difference/max": 1.0144225358963013, "sampling/sampling_logp_difference/mean": 0.02918568253517151, "step": 347, "step_time": 71.32271579199005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 43.625, "completions/mean_terminated_length": 43.625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3079647123813629, "epoch": 0.696, "frac_reward_zero_std": 0.0, "grad_norm": 1.154685139656067, "kl": 0.09130249172449112, "learning_rate": 3.794143964194976e-06, "loss": -0.0868, "num_tokens": 1936951.0, "reward": 0.46250003576278687, "reward_std": 0.5227590799331665, "rewards/reward_func/mean": 0.46250003576278687, "rewards/reward_func/std": 0.5515885949134827, "sampling/importance_sampling_ratio/max": 1.4307719469070435, "sampling/importance_sampling_ratio/mean": 0.8543438911437988, "sampling/importance_sampling_ratio/min": 0.3845389783382416, "sampling/sampling_logp_difference/max": 0.6325764656066895, "sampling/sampling_logp_difference/mean": 0.03074759989976883, "step": 348, "step_time": 61.10349588000099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.32811909914016724, "epoch": 0.698, "frac_reward_zero_std": 0.0, "grad_norm": 1.4756181240081787, "kl": 0.07114064693450928, "learning_rate": 3.7872095893301344e-06, "loss": 0.232, "num_tokens": 1942770.0, "reward": 0.3100000023841858, "reward_std": 0.31517019867897034, "rewards/reward_func/mean": 0.3100000023841858, "rewards/reward_func/std": 0.5469656586647034, "sampling/importance_sampling_ratio/max": 1.6810601949691772, "sampling/importance_sampling_ratio/mean": 0.9875794649124146, "sampling/importance_sampling_ratio/min": 0.3065728545188904, "sampling/sampling_logp_difference/max": 0.4949173927307129, "sampling/sampling_logp_difference/mean": 0.025241130962967873, "step": 349, "step_time": 48.66686181901605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.3392306864261627, "epoch": 0.7, "frac_reward_zero_std": 0.0, "grad_norm": 1.145804524421692, "kl": 0.03867422789335251, "learning_rate": 3.7802617122457976e-06, "loss": 0.0707, "num_tokens": 1948625.0, "reward": 0.09000000357627869, "reward_std": 0.2660285234451294, "rewards/reward_func/mean": 0.09000000357627869, "rewards/reward_func/std": 0.3568112850189209, "sampling/importance_sampling_ratio/max": 1.6579951047897339, "sampling/importance_sampling_ratio/mean": 1.0459256172180176, "sampling/importance_sampling_ratio/min": 0.5915707945823669, "sampling/sampling_logp_difference/max": 0.4812997579574585, "sampling/sampling_logp_difference/mean": 0.02722543105483055, "step": 350, "step_time": 76.36607002699748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3211192488670349, "epoch": 0.702, "frac_reward_zero_std": 0.0, "grad_norm": 1.0469977855682373, "kl": 0.027719926089048386, "learning_rate": 3.773300405821908e-06, "loss": -0.0345, "num_tokens": 1954448.0, "reward": 0.32375001907348633, "reward_std": 0.281686395406723, "rewards/reward_func/mean": 0.32375001907348633, "rewards/reward_func/std": 0.527769923210144, "sampling/importance_sampling_ratio/max": 1.9723066091537476, "sampling/importance_sampling_ratio/mean": 1.1106541156768799, "sampling/importance_sampling_ratio/min": 0.5282062292098999, "sampling/sampling_logp_difference/max": 0.3540763854980469, "sampling/sampling_logp_difference/mean": 0.02676708996295929, "step": 351, "step_time": 78.26852857100312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3046337366104126, "epoch": 0.704, "frac_reward_zero_std": 0.0, "grad_norm": 1.208288550376892, "kl": 0.03900061175227165, "learning_rate": 3.766325743079277e-06, "loss": -0.1125, "num_tokens": 1959253.0, "reward": 0.48375001549720764, "reward_std": 0.5962894558906555, "rewards/reward_func/mean": 0.48375001549720764, "rewards/reward_func/std": 0.5521371364593506, "sampling/importance_sampling_ratio/max": 1.670057773590088, "sampling/importance_sampling_ratio/mean": 0.9210529327392578, "sampling/importance_sampling_ratio/min": 0.5158092379570007, "sampling/sampling_logp_difference/max": 0.432373046875, "sampling/sampling_logp_difference/mean": 0.026169460266828537, "step": 352, "step_time": 46.398978653975064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.31318768858909607, "epoch": 0.706, "frac_reward_zero_std": 0.0, "grad_norm": 1.743395209312439, "kl": 0.0346795991063118, "learning_rate": 3.7593377971788162e-06, "loss": 0.1768, "num_tokens": 1964058.0, "reward": 0.22500000894069672, "reward_std": 0.31266871094703674, "rewards/reward_func/mean": 0.22500000894069672, "rewards/reward_func/std": 0.4766250550746918, "sampling/importance_sampling_ratio/max": 1.8619552850723267, "sampling/importance_sampling_ratio/mean": 1.091329574584961, "sampling/importance_sampling_ratio/min": 0.28429219126701355, "sampling/sampling_logp_difference/max": 0.7750775814056396, "sampling/sampling_logp_difference/mean": 0.031623724848032, "step": 353, "step_time": 58.123137532005785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.35126036405563354, "epoch": 0.708, "frac_reward_zero_std": 0.0, "grad_norm": 1.4767982959747314, "kl": 0.033217888325452805, "learning_rate": 3.752336641420772e-06, "loss": 0.0776, "num_tokens": 1968954.0, "reward": 0.058750007301568985, "reward_std": 0.2922004461288452, "rewards/reward_func/mean": 0.058750007301568985, "rewards/reward_func/std": 0.3824521601200104, "sampling/importance_sampling_ratio/max": 1.434964895248413, "sampling/importance_sampling_ratio/mean": 1.084316372871399, "sampling/importance_sampling_ratio/min": 0.5846006870269775, "sampling/sampling_logp_difference/max": 0.4179229736328125, "sampling/sampling_logp_difference/mean": 0.0240701362490654, "step": 354, "step_time": 59.292683080013376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.328327476978302, "epoch": 0.71, "frac_reward_zero_std": 0.0, "grad_norm": 1.247985601425171, "kl": 0.08559633791446686, "learning_rate": 3.7453223492439544e-06, "loss": 0.0737, "num_tokens": 1975108.0, "reward": 0.4699999988079071, "reward_std": 0.5927736163139343, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.5489730834960938, "sampling/importance_sampling_ratio/max": 1.4081599712371826, "sampling/importance_sampling_ratio/mean": 0.8387018442153931, "sampling/importance_sampling_ratio/min": 0.19741253554821014, "sampling/sampling_logp_difference/max": 0.9286923408508301, "sampling/sampling_logp_difference/mean": 0.03092752769589424, "step": 355, "step_time": 75.66988031598157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.28474941849708557, "epoch": 0.712, "frac_reward_zero_std": 0.0, "grad_norm": 1.736219882965088, "kl": 0.04625285789370537, "learning_rate": 3.7382949942249695e-06, "loss": 0.2333, "num_tokens": 1980329.0, "reward": 0.3212500214576721, "reward_std": 0.5408031344413757, "rewards/reward_func/mean": 0.3212500214576721, "rewards/reward_func/std": 0.5195723176002502, "sampling/importance_sampling_ratio/max": 2.1159582138061523, "sampling/importance_sampling_ratio/mean": 1.136248230934143, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6177792549133301, "sampling/sampling_logp_difference/mean": 0.02496938779950142, "step": 356, "step_time": 65.68752502801362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.32189249992370605, "epoch": 0.714, "frac_reward_zero_std": 0.0, "grad_norm": 1.1041979789733887, "kl": 0.060013748705387115, "learning_rate": 3.731254650077446e-06, "loss": -0.0802, "num_tokens": 1985708.0, "reward": 0.44624999165534973, "reward_std": 0.6114711165428162, "rewards/reward_func/mean": 0.44624999165534973, "rewards/reward_func/std": 0.5663905143737793, "sampling/importance_sampling_ratio/max": 1.5184648036956787, "sampling/importance_sampling_ratio/mean": 0.783623218536377, "sampling/importance_sampling_ratio/min": 0.3887534737586975, "sampling/sampling_logp_difference/max": 0.5213687419891357, "sampling/sampling_logp_difference/mean": 0.025201398879289627, "step": 357, "step_time": 61.06893451101496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 44.625, "completions/mean_terminated_length": 44.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.34849998354911804, "epoch": 0.716, "frac_reward_zero_std": 0.0, "grad_norm": 1.8569178581237793, "kl": 0.10115896165370941, "learning_rate": 3.724201390651263e-06, "loss": 0.0172, "num_tokens": 1991176.0, "reward": 0.0912499949336052, "reward_std": 0.27447310090065, "rewards/reward_func/mean": 0.0912499949336052, "rewards/reward_func/std": 0.36868250370025635, "sampling/importance_sampling_ratio/max": 2.399606943130493, "sampling/importance_sampling_ratio/mean": 1.0813590288162231, "sampling/importance_sampling_ratio/min": 0.29478445649147034, "sampling/sampling_logp_difference/max": 1.0274195671081543, "sampling/sampling_logp_difference/mean": 0.03125939890742302, "step": 358, "step_time": 74.65547078498639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.36881792545318604, "epoch": 0.718, "frac_reward_zero_std": 0.0, "grad_norm": 1.5232397317886353, "kl": 0.06990374624729156, "learning_rate": 3.7171352899317743e-06, "loss": 0.0231, "num_tokens": 1997445.0, "reward": 0.2199999988079071, "reward_std": 0.3053818643093109, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.4732864201068878, "sampling/importance_sampling_ratio/max": 1.7546019554138184, "sampling/importance_sampling_ratio/mean": 0.9471590518951416, "sampling/importance_sampling_ratio/min": 0.46070781350135803, "sampling/sampling_logp_difference/max": 0.7129201889038086, "sampling/sampling_logp_difference/mean": 0.028827045112848282, "step": 359, "step_time": 74.88154268401559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 43.625, "completions/mean_terminated_length": 43.625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.36909693479537964, "epoch": 0.72, "frac_reward_zero_std": 0.0, "grad_norm": 0.8818954825401306, "kl": 0.040109992027282715, "learning_rate": 3.710056422039033e-06, "loss": 0.2106, "num_tokens": 2003046.0, "reward": 0.3062500059604645, "reward_std": 0.5747673511505127, "rewards/reward_func/mean": 0.3062500059604645, "rewards/reward_func/std": 0.5578514337539673, "sampling/importance_sampling_ratio/max": 2.0071616172790527, "sampling/importance_sampling_ratio/mean": 1.0366851091384888, "sampling/importance_sampling_ratio/min": 0.5076926946640015, "sampling/sampling_logp_difference/max": 0.45261478424072266, "sampling/sampling_logp_difference/mean": 0.028059128671884537, "step": 360, "step_time": 67.58852625099826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.375, "completions/mean_terminated_length": 49.375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.37454652786254883, "epoch": 0.722, "frac_reward_zero_std": 0.0, "grad_norm": 1.1584450006484985, "kl": 0.01666010171175003, "learning_rate": 3.702964861227013e-06, "loss": 0.0801, "num_tokens": 2008281.0, "reward": -0.08624999970197678, "reward_std": 0.0722728967666626, "rewards/reward_func/mean": -0.08624999970197678, "rewards/reward_func/std": 0.06781013309955597, "sampling/importance_sampling_ratio/max": 1.410200595855713, "sampling/importance_sampling_ratio/mean": 0.9846078753471375, "sampling/importance_sampling_ratio/min": 0.7172226309776306, "sampling/sampling_logp_difference/max": 0.46905517578125, "sampling/sampling_logp_difference/mean": 0.025215893983840942, "step": 361, "step_time": 91.82594713801518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 39.375, "completions/mean_terminated_length": 39.375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.3056425154209137, "epoch": 0.724, "frac_reward_zero_std": 0.0, "grad_norm": 1.8975749015808105, "kl": 0.11677496135234833, "learning_rate": 3.695860681882832e-06, "loss": 0.0079, "num_tokens": 2014004.0, "reward": 0.4437500238418579, "reward_std": 0.6349660754203796, "rewards/reward_func/mean": 0.4437500238418579, "rewards/reward_func/std": 0.5882161259651184, "sampling/importance_sampling_ratio/max": 2.2386791706085205, "sampling/importance_sampling_ratio/mean": 1.0769392251968384, "sampling/importance_sampling_ratio/min": 0.508983850479126, "sampling/sampling_logp_difference/max": 0.8052873611450195, "sampling/sampling_logp_difference/mean": 0.029708731919527054, "step": 362, "step_time": 70.66860884500784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.125, "completions/mean_terminated_length": 46.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3323149085044861, "epoch": 0.726, "frac_reward_zero_std": 0.0, "grad_norm": 1.601610541343689, "kl": 0.044327329844236374, "learning_rate": 3.6887439585259693e-06, "loss": -0.0394, "num_tokens": 2019115.0, "reward": 0.19749999046325684, "reward_std": 0.5344071984291077, "rewards/reward_func/mean": 0.19749999046325684, "rewards/reward_func/std": 0.49485206604003906, "sampling/importance_sampling_ratio/max": 1.6172147989273071, "sampling/importance_sampling_ratio/mean": 1.0461113452911377, "sampling/importance_sampling_ratio/min": 0.661210834980011, "sampling/sampling_logp_difference/max": 0.423846960067749, "sampling/sampling_logp_difference/mean": 0.02795753814280033, "step": 363, "step_time": 64.47163575098966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3601047396659851, "epoch": 0.728, "frac_reward_zero_std": 0.0, "grad_norm": 1.0017635822296143, "kl": 0.026227379217743874, "learning_rate": 3.6816147658074864e-06, "loss": 0.0791, "num_tokens": 2024411.0, "reward": 0.2212499976158142, "reward_std": 0.5133354663848877, "rewards/reward_func/mean": 0.2212499976158142, "rewards/reward_func/std": 0.4760383367538452, "sampling/importance_sampling_ratio/max": 1.4538358449935913, "sampling/importance_sampling_ratio/mean": 0.9334630966186523, "sampling/importance_sampling_ratio/min": 0.5542329549789429, "sampling/sampling_logp_difference/max": 0.3732813596725464, "sampling/sampling_logp_difference/mean": 0.02371375635266304, "step": 364, "step_time": 62.915858155989554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.33008331060409546, "epoch": 0.73, "frac_reward_zero_std": 0.0, "grad_norm": 1.4119970798492432, "kl": 0.02246996760368347, "learning_rate": 3.6744731785092396e-06, "loss": 0.1875, "num_tokens": 2029629.0, "reward": 0.4650000333786011, "reward_std": 0.4775117039680481, "rewards/reward_func/mean": 0.4650000333786011, "rewards/reward_func/std": 0.5166375041007996, "sampling/importance_sampling_ratio/max": 1.4311546087265015, "sampling/importance_sampling_ratio/mean": 0.8069183230400085, "sampling/importance_sampling_ratio/min": 0.30695483088493347, "sampling/sampling_logp_difference/max": 0.8008233308792114, "sampling/sampling_logp_difference/mean": 0.028506487607955933, "step": 365, "step_time": 69.22173458198085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.34061259031295776, "epoch": 0.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.749092161655426, "kl": 0.06381135433912277, "learning_rate": 3.6673192715431016e-06, "loss": 0.1062, "num_tokens": 2035390.0, "reward": 0.3412500023841858, "reward_std": 0.5657950639724731, "rewards/reward_func/mean": 0.3412500023841858, "rewards/reward_func/std": 0.5442803502082825, "sampling/importance_sampling_ratio/max": 1.2575000524520874, "sampling/importance_sampling_ratio/mean": 0.7373183965682983, "sampling/importance_sampling_ratio/min": 0.2651961147785187, "sampling/sampling_logp_difference/max": 0.8941724300384521, "sampling/sampling_logp_difference/mean": 0.024856336414813995, "step": 366, "step_time": 61.59983134100912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3267011046409607, "epoch": 0.734, "frac_reward_zero_std": 0.0, "grad_norm": 1.0291807651519775, "kl": 0.0357687771320343, "learning_rate": 3.6601531199501715e-06, "loss": 0.0779, "num_tokens": 2041220.0, "reward": 0.33000001311302185, "reward_std": 0.5733025074005127, "rewards/reward_func/mean": 0.33000001311302185, "rewards/reward_func/std": 0.5529143214225769, "sampling/importance_sampling_ratio/max": 1.1229214668273926, "sampling/importance_sampling_ratio/mean": 0.8919734954833984, "sampling/importance_sampling_ratio/min": 0.6937599778175354, "sampling/sampling_logp_difference/max": 0.5363889932632446, "sampling/sampling_logp_difference/mean": 0.029397767037153244, "step": 367, "step_time": 66.55807171101333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.4076734185218811, "epoch": 0.736, "frac_reward_zero_std": 0.0, "grad_norm": 1.5241880416870117, "kl": 0.04465167969465256, "learning_rate": 3.652974798899988e-06, "loss": -0.1161, "num_tokens": 2047319.0, "reward": 0.3499999940395355, "reward_std": 0.5534278154373169, "rewards/reward_func/mean": 0.3499999940395355, "rewards/reward_func/std": 0.5284478664398193, "sampling/importance_sampling_ratio/max": 1.7426992654800415, "sampling/importance_sampling_ratio/mean": 0.89775550365448, "sampling/importance_sampling_ratio/min": 0.49534907937049866, "sampling/sampling_logp_difference/max": 0.423353910446167, "sampling/sampling_logp_difference/mean": 0.027613524347543716, "step": 368, "step_time": 61.087412825989304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.31100600957870483, "epoch": 0.738, "frac_reward_zero_std": 0.0, "grad_norm": 1.9741218090057373, "kl": 0.041330281645059586, "learning_rate": 3.645784383689742e-06, "loss": -0.0427, "num_tokens": 2052270.0, "reward": 0.45249998569488525, "reward_std": 0.6035691499710083, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.5591256618499756, "sampling/importance_sampling_ratio/max": 1.8154767751693726, "sampling/importance_sampling_ratio/mean": 1.2792425155639648, "sampling/importance_sampling_ratio/min": 0.7800292372703552, "sampling/sampling_logp_difference/max": 0.3694136142730713, "sampling/sampling_logp_difference/mean": 0.022285200655460358, "step": 369, "step_time": 50.397062509000534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3526594042778015, "epoch": 0.74, "frac_reward_zero_std": 0.0, "grad_norm": 1.3253847360610962, "kl": 0.1274574100971222, "learning_rate": 3.6385819497434877e-06, "loss": -0.0685, "num_tokens": 2057269.0, "reward": 0.33375000953674316, "reward_std": 0.5655902028083801, "rewards/reward_func/mean": 0.33375000953674316, "rewards/reward_func/std": 0.5493616461753845, "sampling/importance_sampling_ratio/max": 1.402198076248169, "sampling/importance_sampling_ratio/mean": 0.8689178824424744, "sampling/importance_sampling_ratio/min": 0.3067050278186798, "sampling/sampling_logp_difference/max": 0.9296143054962158, "sampling/sampling_logp_difference/mean": 0.026013534516096115, "step": 370, "step_time": 54.17731352400733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.32903480529785156, "epoch": 0.742, "frac_reward_zero_std": 0.0, "grad_norm": 1.6364682912826538, "kl": 0.044946420937776566, "learning_rate": 3.631367572611348e-06, "loss": -0.2922, "num_tokens": 2063722.0, "reward": 0.3500000238418579, "reward_std": 0.5495222806930542, "rewards/reward_func/mean": 0.3500000238418579, "rewards/reward_func/std": 0.5277445316314697, "sampling/importance_sampling_ratio/max": 1.5483042001724243, "sampling/importance_sampling_ratio/mean": 0.8218961954116821, "sampling/importance_sampling_ratio/min": 0.41721194982528687, "sampling/sampling_logp_difference/max": 0.5305330753326416, "sampling/sampling_logp_difference/mean": 0.028160959482192993, "step": 371, "step_time": 61.68605206900975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.31724613904953003, "epoch": 0.744, "frac_reward_zero_std": 0.0, "grad_norm": 1.3481601476669312, "kl": 0.06961038708686829, "learning_rate": 3.6241413279687256e-06, "loss": 0.2308, "num_tokens": 2069668.0, "reward": 0.20499999821186066, "reward_std": 0.3475438058376312, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.4931531250476837, "sampling/importance_sampling_ratio/max": 1.3638176918029785, "sampling/importance_sampling_ratio/mean": 0.7686522006988525, "sampling/importance_sampling_ratio/min": 0.18426480889320374, "sampling/sampling_logp_difference/max": 0.6221842765808105, "sampling/sampling_logp_difference/mean": 0.032336391508579254, "step": 372, "step_time": 73.30788465001388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 49.875, "completions/mean_terminated_length": 49.875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.29763156175613403, "epoch": 0.746, "frac_reward_zero_std": 0.0, "grad_norm": 1.0755096673965454, "kl": 0.02788379229605198, "learning_rate": 3.616903291615506e-06, "loss": 0.0231, "num_tokens": 2074693.0, "reward": 0.3149999976158142, "reward_std": 0.5274383425712585, "rewards/reward_func/mean": 0.3149999976158142, "rewards/reward_func/std": 0.5020813345909119, "sampling/importance_sampling_ratio/max": 1.2642863988876343, "sampling/importance_sampling_ratio/mean": 0.8364578485488892, "sampling/importance_sampling_ratio/min": 0.37059077620506287, "sampling/sampling_logp_difference/max": 0.4319629669189453, "sampling/sampling_logp_difference/mean": 0.026207586750388145, "step": 373, "step_time": 66.48929881799268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 46.125, "completions/mean_terminated_length": 46.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.35318872332572937, "epoch": 0.748, "frac_reward_zero_std": 0.0, "grad_norm": 1.5173914432525635, "kl": 0.024975910782814026, "learning_rate": 3.609653539475268e-06, "loss": -0.0445, "num_tokens": 2080341.0, "reward": 0.3137499988079071, "reward_std": 0.3301275372505188, "rewards/reward_func/mean": 0.3137499988079071, "rewards/reward_func/std": 0.5643184781074524, "sampling/importance_sampling_ratio/max": 1.3749171495437622, "sampling/importance_sampling_ratio/mean": 0.8896816372871399, "sampling/importance_sampling_ratio/min": 0.5193299651145935, "sampling/sampling_logp_difference/max": 0.5717992782592773, "sampling/sampling_logp_difference/mean": 0.030124176293611526, "step": 374, "step_time": 78.54219227202702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.36500370502471924, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 1.6961071491241455, "kl": 0.21644490957260132, "learning_rate": 3.6023921475944795e-06, "loss": 0.2398, "num_tokens": 2085708.0, "reward": 0.19875000417232513, "reward_std": 0.517146110534668, "rewards/reward_func/mean": 0.19875000417232513, "rewards/reward_func/std": 0.47908952832221985, "sampling/importance_sampling_ratio/max": 2.1243157386779785, "sampling/importance_sampling_ratio/mean": 0.9670206904411316, "sampling/importance_sampling_ratio/min": 0.29750171303749084, "sampling/sampling_logp_difference/max": 1.0457005500793457, "sampling/sampling_logp_difference/mean": 0.03371373564004898, "step": 375, "step_time": 66.68903090100503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.38407090306282043, "epoch": 0.752, "frac_reward_zero_std": 0.0, "grad_norm": 1.7444822788238525, "kl": 0.07864461094141006, "learning_rate": 3.5951191921417063e-06, "loss": -0.0054, "num_tokens": 2091007.0, "reward": 0.3712500035762787, "reward_std": 0.5389498472213745, "rewards/reward_func/mean": 0.3712500035762787, "rewards/reward_func/std": 0.5179474949836731, "sampling/importance_sampling_ratio/max": 1.3411577939987183, "sampling/importance_sampling_ratio/mean": 0.9035917520523071, "sampling/importance_sampling_ratio/min": 0.5722795128822327, "sampling/sampling_logp_difference/max": 0.6103904247283936, "sampling/sampling_logp_difference/mean": 0.04205818474292755, "step": 376, "step_time": 62.62372294199304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 45.375, "completions/mean_terminated_length": 45.375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3571351170539856, "epoch": 0.754, "frac_reward_zero_std": 0.0, "grad_norm": 1.4393010139465332, "kl": 0.05092768371105194, "learning_rate": 3.5878347494068083e-06, "loss": 0.0737, "num_tokens": 2096885.0, "reward": -0.08124999701976776, "reward_std": 0.05413114279508591, "rewards/reward_func/mean": -0.08124999701976776, "rewards/reward_func/std": 0.05617256462574005, "sampling/importance_sampling_ratio/max": 1.5597022771835327, "sampling/importance_sampling_ratio/mean": 1.0888936519622803, "sampling/importance_sampling_ratio/min": 0.7159003019332886, "sampling/sampling_logp_difference/max": 0.7558160424232483, "sampling/sampling_logp_difference/mean": 0.028478611260652542, "step": 377, "step_time": 78.95239180698991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 44.125, "completions/mean_terminated_length": 44.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3541492819786072, "epoch": 0.756, "frac_reward_zero_std": 0.0, "grad_norm": 1.104987382888794, "kl": 0.06759263575077057, "learning_rate": 3.580538895800144e-06, "loss": -0.0204, "num_tokens": 2102217.0, "reward": 0.19624999165534973, "reward_std": 0.5378745794296265, "rewards/reward_func/mean": 0.19624999165534973, "rewards/reward_func/std": 0.49805158376693726, "sampling/importance_sampling_ratio/max": 0.9906109571456909, "sampling/importance_sampling_ratio/mean": 0.7715339660644531, "sampling/importance_sampling_ratio/min": 0.558393657207489, "sampling/sampling_logp_difference/max": 0.5296880006790161, "sampling/sampling_logp_difference/mean": 0.026829030364751816, "step": 378, "step_time": 68.67950404499425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 43.625, "completions/mean_terminated_length": 43.625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3333103060722351, "epoch": 0.758, "frac_reward_zero_std": 0.0, "grad_norm": 1.306211233139038, "kl": 0.04187663272023201, "learning_rate": 3.573231707851765e-06, "loss": -0.0646, "num_tokens": 2108035.0, "reward": 0.4725000262260437, "reward_std": 0.5237306356430054, "rewards/reward_func/mean": 0.4725000262260437, "rewards/reward_func/std": 0.5537598729133606, "sampling/importance_sampling_ratio/max": 1.1231999397277832, "sampling/importance_sampling_ratio/mean": 0.8123407363891602, "sampling/importance_sampling_ratio/min": 0.6417423486709595, "sampling/sampling_logp_difference/max": 0.675841212272644, "sampling/sampling_logp_difference/mean": 0.029995568096637726, "step": 379, "step_time": 58.079839242011076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 42.125, "completions/mean_terminated_length": 42.125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.33336013555526733, "epoch": 0.76, "frac_reward_zero_std": 0.0, "grad_norm": 1.535415530204773, "kl": 0.06656455248594284, "learning_rate": 3.5659132622106152e-06, "loss": -0.1762, "num_tokens": 2113701.0, "reward": 0.0637500062584877, "reward_std": 0.2779829502105713, "rewards/reward_func/mean": 0.0637500062584877, "rewards/reward_func/std": 0.37591552734375, "sampling/importance_sampling_ratio/max": 2.21850848197937, "sampling/importance_sampling_ratio/mean": 1.0035760402679443, "sampling/importance_sampling_ratio/min": 0.36623576283454895, "sampling/sampling_logp_difference/max": 0.5119132995605469, "sampling/sampling_logp_difference/mean": 0.03865154832601547, "step": 380, "step_time": 83.85681084700627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.34765076637268066, "epoch": 0.762, "frac_reward_zero_std": 0.0, "grad_norm": 1.4875462055206299, "kl": 0.05875123292207718, "learning_rate": 3.5585836356437266e-06, "loss": 0.0993, "num_tokens": 2118822.0, "reward": 0.05499999597668648, "reward_std": 0.28776630759239197, "rewards/reward_func/mean": 0.05499999597668648, "rewards/reward_func/std": 0.37928506731987, "sampling/importance_sampling_ratio/max": 2.0379128456115723, "sampling/importance_sampling_ratio/mean": 1.3014514446258545, "sampling/importance_sampling_ratio/min": 0.6343486309051514, "sampling/sampling_logp_difference/max": 0.4492349624633789, "sampling/sampling_logp_difference/mean": 0.026231329888105392, "step": 381, "step_time": 78.49495024702628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3074229955673218, "epoch": 0.764, "frac_reward_zero_std": 0.0, "grad_norm": 1.0164070129394531, "kl": 0.019709967076778412, "learning_rate": 3.551242905035412e-06, "loss": -0.1317, "num_tokens": 2125216.0, "reward": 0.08374999463558197, "reward_std": 0.2835198938846588, "rewards/reward_func/mean": 0.08374999463558197, "rewards/reward_func/std": 0.37217265367507935, "sampling/importance_sampling_ratio/max": 1.2361012697219849, "sampling/importance_sampling_ratio/mean": 0.9302610158920288, "sampling/importance_sampling_ratio/min": 0.6827021241188049, "sampling/sampling_logp_difference/max": 0.3573673963546753, "sampling/sampling_logp_difference/mean": 0.02304799109697342, "step": 382, "step_time": 84.3939113280212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.3727479577064514, "epoch": 0.766, "frac_reward_zero_std": 0.0, "grad_norm": 2.0571773052215576, "kl": 0.09017743915319443, "learning_rate": 3.5438911473864633e-06, "loss": -0.2174, "num_tokens": 2131334.0, "reward": 0.0962500050663948, "reward_std": 0.2652566134929657, "rewards/reward_func/mean": 0.0962500050663948, "rewards/reward_func/std": 0.35399505496025085, "sampling/importance_sampling_ratio/max": 2.516140937805176, "sampling/importance_sampling_ratio/mean": 1.00909423828125, "sampling/importance_sampling_ratio/min": 0.6436149477958679, "sampling/sampling_logp_difference/max": 0.5928447246551514, "sampling/sampling_logp_difference/mean": 0.03408171236515045, "step": 383, "step_time": 73.35666742300964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3380432724952698, "epoch": 0.768, "frac_reward_zero_std": 0.0, "grad_norm": 1.5185048580169678, "kl": 0.05075772851705551, "learning_rate": 3.5365284398133404e-06, "loss": 0.1199, "num_tokens": 2136480.0, "reward": 0.30000001192092896, "reward_std": 0.5517951250076294, "rewards/reward_func/mean": 0.30000001192092896, "rewards/reward_func/std": 0.5378262996673584, "sampling/importance_sampling_ratio/max": 2.533414125442505, "sampling/importance_sampling_ratio/mean": 0.987388014793396, "sampling/importance_sampling_ratio/min": 0.2914103865623474, "sampling/sampling_logp_difference/max": 0.4989492893218994, "sampling/sampling_logp_difference/mean": 0.03067699819803238, "step": 384, "step_time": 55.22710300600738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3242243528366089, "epoch": 0.77, "frac_reward_zero_std": 0.0, "grad_norm": 1.345138669013977, "kl": 0.039375655353069305, "learning_rate": 3.52915485954736e-06, "loss": 0.0886, "num_tokens": 2141751.0, "reward": 0.4362500011920929, "reward_std": 0.5983107686042786, "rewards/reward_func/mean": 0.4362500011920929, "rewards/reward_func/std": 0.5539711117744446, "sampling/importance_sampling_ratio/max": 1.7134405374526978, "sampling/importance_sampling_ratio/mean": 1.042137861251831, "sampling/importance_sampling_ratio/min": 0.4775417149066925, "sampling/sampling_logp_difference/max": 0.5579397678375244, "sampling/sampling_logp_difference/mean": 0.025659702718257904, "step": 385, "step_time": 67.54349041997921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 37.125, "completions/mean_terminated_length": 37.125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.33516746759414673, "epoch": 0.772, "frac_reward_zero_std": 0.0, "grad_norm": 2.208026170730591, "kl": 0.05478062853217125, "learning_rate": 3.521770483933891e-06, "loss": 0.2502, "num_tokens": 2146979.0, "reward": -0.0637499988079071, "reward_std": 0.04775945842266083, "rewards/reward_func/mean": -0.0637499988079071, "rewards/reward_func/std": 0.050409041345119476, "sampling/importance_sampling_ratio/max": 1.972628116607666, "sampling/importance_sampling_ratio/mean": 1.1597208976745605, "sampling/importance_sampling_ratio/min": 0.6736937165260315, "sampling/sampling_logp_difference/max": 0.4482576847076416, "sampling/sampling_logp_difference/mean": 0.027906980365514755, "step": 386, "step_time": 74.46757221099688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.37138980627059937, "epoch": 0.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.9289368391036987, "kl": 0.10337759554386139, "learning_rate": 3.514375390431539e-06, "loss": 0.172, "num_tokens": 2153373.0, "reward": 0.2900000214576721, "reward_std": 0.5930180549621582, "rewards/reward_func/mean": 0.2900000214576721, "rewards/reward_func/std": 0.5795319080352783, "sampling/importance_sampling_ratio/max": 1.536302924156189, "sampling/importance_sampling_ratio/mean": 0.7297533750534058, "sampling/importance_sampling_ratio/min": 0.289122611284256, "sampling/sampling_logp_difference/max": 0.8604832887649536, "sampling/sampling_logp_difference/mean": 0.034349218010902405, "step": 387, "step_time": 61.59288591900258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 44.375, "completions/mean_terminated_length": 44.375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.4133331775665283, "epoch": 0.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.9855160713195801, "kl": 0.045814886689186096, "learning_rate": 3.5069696566113347e-06, "loss": 0.0904, "num_tokens": 2159078.0, "reward": 0.33500000834465027, "reward_std": 0.5581594705581665, "rewards/reward_func/mean": 0.33500000834465027, "rewards/reward_func/std": 0.5354037284851074, "sampling/importance_sampling_ratio/max": 1.518318772315979, "sampling/importance_sampling_ratio/mean": 0.8139652609825134, "sampling/importance_sampling_ratio/min": 0.37126120924949646, "sampling/sampling_logp_difference/max": 0.5169713497161865, "sampling/sampling_logp_difference/mean": 0.03355231136083603, "step": 388, "step_time": 81.17526444801479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 35.875, "completions/mean_terminated_length": 35.875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3351132869720459, "epoch": 0.778, "frac_reward_zero_std": 0.0, "grad_norm": 1.4382963180541992, "kl": 0.047305479645729065, "learning_rate": 3.499553360155923e-06, "loss": 0.1196, "num_tokens": 2165109.0, "reward": 0.2150000035762787, "reward_std": 0.5208038091659546, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.48329228162765503, "sampling/importance_sampling_ratio/max": 1.9932667016983032, "sampling/importance_sampling_ratio/mean": 1.2677991390228271, "sampling/importance_sampling_ratio/min": 0.7389498949050903, "sampling/sampling_logp_difference/max": 0.46536529064178467, "sampling/sampling_logp_difference/mean": 0.027284495532512665, "step": 389, "step_time": 68.42802210498485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3567490577697754, "epoch": 0.78, "frac_reward_zero_std": 0.0, "grad_norm": 1.0806845426559448, "kl": 0.039206504821777344, "learning_rate": 3.4921265788587432e-06, "loss": -0.1312, "num_tokens": 2170662.0, "reward": 0.1899999976158142, "reward_std": 0.5110079050064087, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.4738294184207916, "sampling/importance_sampling_ratio/max": 1.3621087074279785, "sampling/importance_sampling_ratio/mean": 0.7618493437767029, "sampling/importance_sampling_ratio/min": 0.11757281422615051, "sampling/sampling_logp_difference/max": 0.7672085762023926, "sampling/sampling_logp_difference/mean": 0.029348157346248627, "step": 390, "step_time": 66.64952985799755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.348691463470459, "epoch": 0.782, "frac_reward_zero_std": 0.0, "grad_norm": 2.4213788509368896, "kl": 0.06088120490312576, "learning_rate": 3.484689390623218e-06, "loss": -0.2873, "num_tokens": 2176785.0, "reward": 0.3537500202655792, "reward_std": 0.5490626096725464, "rewards/reward_func/mean": 0.3537500202655792, "rewards/reward_func/std": 0.5272554159164429, "sampling/importance_sampling_ratio/max": 2.726332426071167, "sampling/importance_sampling_ratio/mean": 1.269676923751831, "sampling/importance_sampling_ratio/min": 0.4701959788799286, "sampling/sampling_logp_difference/max": 0.5951485633850098, "sampling/sampling_logp_difference/mean": 0.031430695205926895, "step": 391, "step_time": 64.82404442300322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.32772278785705566, "epoch": 0.784, "frac_reward_zero_std": 0.0, "grad_norm": 1.8402619361877441, "kl": 0.03273371234536171, "learning_rate": 3.4772418734619325e-06, "loss": 0.2288, "num_tokens": 2182195.0, "reward": 0.09624999761581421, "reward_std": 0.27131104469299316, "rewards/reward_func/mean": 0.09624999761581421, "rewards/reward_func/std": 0.3657844066619873, "sampling/importance_sampling_ratio/max": 2.0606470108032227, "sampling/importance_sampling_ratio/mean": 1.0602631568908691, "sampling/importance_sampling_ratio/min": 0.5077344179153442, "sampling/sampling_logp_difference/max": 0.41891008615493774, "sampling/sampling_logp_difference/mean": 0.028547827154397964, "step": 392, "step_time": 64.06525129399961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.32786738872528076, "epoch": 0.786, "frac_reward_zero_std": 0.0, "grad_norm": 1.5365052223205566, "kl": 0.044348280876874924, "learning_rate": 3.4697841054958163e-06, "loss": -0.1633, "num_tokens": 2188346.0, "reward": 0.36000001430511475, "reward_std": 0.5519298315048218, "rewards/reward_func/mean": 0.36000001430511475, "rewards/reward_func/std": 0.5301482677459717, "sampling/importance_sampling_ratio/max": 2.044487237930298, "sampling/importance_sampling_ratio/mean": 1.0870068073272705, "sampling/importance_sampling_ratio/min": 0.6400982141494751, "sampling/sampling_logp_difference/max": 0.7581937313079834, "sampling/sampling_logp_difference/mean": 0.02735818549990654, "step": 393, "step_time": 63.89243840900599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 41.625, "completions/mean_terminated_length": 41.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3156575858592987, "epoch": 0.788, "frac_reward_zero_std": 0.0, "grad_norm": 1.4243261814117432, "kl": 0.05710726976394653, "learning_rate": 3.4623161649533284e-06, "loss": -0.3008, "num_tokens": 2193765.0, "reward": 0.32749998569488525, "reward_std": 0.5432307720184326, "rewards/reward_func/mean": 0.32749998569488525, "rewards/reward_func/std": 0.5308685898780823, "sampling/importance_sampling_ratio/max": 2.2074787616729736, "sampling/importance_sampling_ratio/mean": 1.2915685176849365, "sampling/importance_sampling_ratio/min": 0.6163145303726196, "sampling/sampling_logp_difference/max": 0.40680623054504395, "sampling/sampling_logp_difference/mean": 0.02459460124373436, "step": 394, "step_time": 63.77927023899974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.357902467250824, "epoch": 0.79, "frac_reward_zero_std": 0.0, "grad_norm": 1.4730643033981323, "kl": 0.07591858506202698, "learning_rate": 3.4548381301696298e-06, "loss": 0.1483, "num_tokens": 2199321.0, "reward": -0.03375000134110451, "reward_std": 0.026678871363401413, "rewards/reward_func/mean": -0.03375000134110451, "rewards/reward_func/std": 0.025035688653588295, "sampling/importance_sampling_ratio/max": 2.484659433364868, "sampling/importance_sampling_ratio/mean": 1.1020760536193848, "sampling/importance_sampling_ratio/min": 0.20723672211170197, "sampling/sampling_logp_difference/max": 0.9204421043395996, "sampling/sampling_logp_difference/mean": 0.03480283543467522, "step": 395, "step_time": 72.94622380597866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.4134517312049866, "epoch": 0.792, "frac_reward_zero_std": 0.0, "grad_norm": 1.6859726905822754, "kl": 0.03517558425664902, "learning_rate": 3.4473500795857674e-06, "loss": -0.1951, "num_tokens": 2204573.0, "reward": 0.20875000953674316, "reward_std": 0.3220616579055786, "rewards/reward_func/mean": 0.20875000953674316, "rewards/reward_func/std": 0.47588828206062317, "sampling/importance_sampling_ratio/max": 1.7888513803482056, "sampling/importance_sampling_ratio/mean": 1.0129364728927612, "sampling/importance_sampling_ratio/min": 0.49311375617980957, "sampling/sampling_logp_difference/max": 0.5829896926879883, "sampling/sampling_logp_difference/mean": 0.03469054028391838, "step": 396, "step_time": 74.89418144299998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 44.125, "completions/mean_terminated_length": 44.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.30955445766448975, "epoch": 0.794, "frac_reward_zero_std": 0.0, "grad_norm": 1.5693347454071045, "kl": 0.04247763007879257, "learning_rate": 3.4398520917478478e-06, "loss": -0.0086, "num_tokens": 2210029.0, "reward": 0.08750000596046448, "reward_std": 0.2670246660709381, "rewards/reward_func/mean": 0.08750000596046448, "rewards/reward_func/std": 0.36958470940589905, "sampling/importance_sampling_ratio/max": 1.8365312814712524, "sampling/importance_sampling_ratio/mean": 1.170079231262207, "sampling/importance_sampling_ratio/min": 0.4552127420902252, "sampling/sampling_logp_difference/max": 0.3448103666305542, "sampling/sampling_logp_difference/mean": 0.02435469999909401, "step": 397, "step_time": 71.30152476101648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.34962934255599976, "epoch": 0.796, "frac_reward_zero_std": 0.0, "grad_norm": 1.2815780639648438, "kl": 0.03365220129489899, "learning_rate": 3.4323442453062173e-06, "loss": 0.1781, "num_tokens": 2214891.0, "reward": 0.32499998807907104, "reward_std": 0.5809470415115356, "rewards/reward_func/mean": 0.32499998807907104, "rewards/reward_func/std": 0.5604844689369202, "sampling/importance_sampling_ratio/max": 1.4398008584976196, "sampling/importance_sampling_ratio/mean": 0.9877474308013916, "sampling/importance_sampling_ratio/min": 0.5344565510749817, "sampling/sampling_logp_difference/max": 0.38164573907852173, "sampling/sampling_logp_difference/mean": 0.02679327502846718, "step": 398, "step_time": 67.73330377798993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 43.125, "completions/mean_terminated_length": 43.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.30688661336898804, "epoch": 0.798, "frac_reward_zero_std": 0.0, "grad_norm": 1.5855470895767212, "kl": 0.0417325496673584, "learning_rate": 3.4248266190146307e-06, "loss": 0.004, "num_tokens": 2220361.0, "reward": 0.4762499928474426, "reward_std": 0.6048096418380737, "rewards/reward_func/mean": 0.4762499928474426, "rewards/reward_func/std": 0.560661256313324, "sampling/importance_sampling_ratio/max": 1.3558542728424072, "sampling/importance_sampling_ratio/mean": 1.0467090606689453, "sampling/importance_sampling_ratio/min": 0.8046448230743408, "sampling/sampling_logp_difference/max": 0.37460851669311523, "sampling/sampling_logp_difference/mean": 0.02754260040819645, "step": 399, "step_time": 63.321532267989824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 50.375, "completions/mean_terminated_length": 50.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.36169493198394775, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 2.4955718517303467, "kl": 0.05709603428840637, "learning_rate": 3.417299291729431e-06, "loss": -0.3635, "num_tokens": 2225385.0, "reward": 0.3387500047683716, "reward_std": 0.5678717494010925, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.5412007570266724, "sampling/importance_sampling_ratio/max": 2.468524694442749, "sampling/importance_sampling_ratio/mean": 1.3570051193237305, "sampling/importance_sampling_ratio/min": 0.45365825295448303, "sampling/sampling_logp_difference/max": 0.5086992979049683, "sampling/sampling_logp_difference/mean": 0.028253626078367233, "step": 400, "step_time": 61.90352356500807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 50.375, "completions/mean_terminated_length": 50.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.34915873408317566, "epoch": 0.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.7310982346534729, "kl": 0.028745461255311966, "learning_rate": 3.4097623424087196e-06, "loss": -0.1418, "num_tokens": 2231023.0, "reward": 0.3375000059604645, "reward_std": 0.2833724915981293, "rewards/reward_func/mean": 0.3375000059604645, "rewards/reward_func/std": 0.5279272198677063, "sampling/importance_sampling_ratio/max": 1.532747745513916, "sampling/importance_sampling_ratio/mean": 0.7657254934310913, "sampling/importance_sampling_ratio/min": 0.29685893654823303, "sampling/sampling_logp_difference/max": 0.4246586561203003, "sampling/sampling_logp_difference/mean": 0.0307551771402359, "step": 401, "step_time": 82.38972250098595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 43.875, "completions/mean_terminated_length": 43.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3424833416938782, "epoch": 0.804, "frac_reward_zero_std": 0.0, "grad_norm": 1.250693917274475, "kl": 0.03432589769363403, "learning_rate": 3.4022158501115283e-06, "loss": -0.166, "num_tokens": 2237005.0, "reward": 0.21125000715255737, "reward_std": 0.312855988740921, "rewards/reward_func/mean": 0.21125000715255737, "rewards/reward_func/std": 0.4853699207305908, "sampling/importance_sampling_ratio/max": 1.5625203847885132, "sampling/importance_sampling_ratio/mean": 1.0789234638214111, "sampling/importance_sampling_ratio/min": 0.6180092096328735, "sampling/sampling_logp_difference/max": 0.35615015029907227, "sampling/sampling_logp_difference/mean": 0.026920361444354057, "step": 402, "step_time": 65.18766634300118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 45.625, "completions/mean_terminated_length": 45.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3446376919746399, "epoch": 0.806, "frac_reward_zero_std": 0.0, "grad_norm": 1.2360810041427612, "kl": 0.0331093966960907, "learning_rate": 3.39465989399699e-06, "loss": 0.16, "num_tokens": 2242114.0, "reward": 0.45625001192092896, "reward_std": 0.6050564050674438, "rewards/reward_func/mean": 0.45625001192092896, "rewards/reward_func/std": 0.5601769685745239, "sampling/importance_sampling_ratio/max": 2.0269222259521484, "sampling/importance_sampling_ratio/mean": 1.0755150318145752, "sampling/importance_sampling_ratio/min": 0.587192714214325, "sampling/sampling_logp_difference/max": 0.29895949363708496, "sampling/sampling_logp_difference/mean": 0.025805631652474403, "step": 403, "step_time": 54.9477135160123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 46.875, "completions/mean_terminated_length": 46.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3936536908149719, "epoch": 0.808, "frac_reward_zero_std": 0.0, "grad_norm": 2.0102906227111816, "kl": 0.059495870023965836, "learning_rate": 3.3870945533235104e-06, "loss": -0.0334, "num_tokens": 2247189.0, "reward": 0.19625000655651093, "reward_std": 0.5299696922302246, "rewards/reward_func/mean": 0.19625000655651093, "rewards/reward_func/std": 0.4908865690231323, "sampling/importance_sampling_ratio/max": 1.2130988836288452, "sampling/importance_sampling_ratio/mean": 0.9266165494918823, "sampling/importance_sampling_ratio/min": 0.7416336536407471, "sampling/sampling_logp_difference/max": 0.2876337766647339, "sampling/sampling_logp_difference/mean": 0.028024829924106598, "step": 404, "step_time": 71.07167344598565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.334345281124115, "epoch": 0.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.5419965982437134, "kl": 0.025859929621219635, "learning_rate": 3.3795199074479312e-06, "loss": -0.1101, "num_tokens": 2252252.0, "reward": 0.6000000238418579, "reward_std": 0.5447690486907959, "rewards/reward_func/mean": 0.6000000238418579, "rewards/reward_func/std": 0.5224940180778503, "sampling/importance_sampling_ratio/max": 1.4799644947052002, "sampling/importance_sampling_ratio/mean": 0.7123466730117798, "sampling/importance_sampling_ratio/min": 0.14022274315357208, "sampling/sampling_logp_difference/max": 0.7181998491287231, "sampling/sampling_logp_difference/mean": 0.026170939207077026, "step": 405, "step_time": 48.95189845201094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3260424733161926, "epoch": 0.812, "frac_reward_zero_std": 0.0, "grad_norm": 1.2206482887268066, "kl": 0.07319030910730362, "learning_rate": 3.3719360358247054e-06, "loss": -0.1504, "num_tokens": 2257641.0, "reward": 0.21000000834465027, "reward_std": 0.3072892427444458, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.47958314418792725, "sampling/importance_sampling_ratio/max": 1.4747114181518555, "sampling/importance_sampling_ratio/mean": 0.7886297702789307, "sampling/importance_sampling_ratio/min": 0.39762672781944275, "sampling/sampling_logp_difference/max": 0.3593275547027588, "sampling/sampling_logp_difference/mean": 0.02779657021164894, "step": 406, "step_time": 61.756067362002796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.341819167137146, "epoch": 0.814, "frac_reward_zero_std": 0.0, "grad_norm": 2.2889909744262695, "kl": 0.01969139650464058, "learning_rate": 3.3643430180050573e-06, "loss": 0.7006, "num_tokens": 2263126.0, "reward": 0.4150000214576721, "reward_std": 0.5482439398765564, "rewards/reward_func/mean": 0.4150000214576721, "rewards/reward_func/std": 0.5823842883110046, "sampling/importance_sampling_ratio/max": 2.7643935680389404, "sampling/importance_sampling_ratio/mean": 1.1803869009017944, "sampling/importance_sampling_ratio/min": 0.6169243454933167, "sampling/sampling_logp_difference/max": 0.3439610004425049, "sampling/sampling_logp_difference/mean": 0.025598403066396713, "step": 407, "step_time": 63.48746524998569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.32550209760665894, "epoch": 0.816, "frac_reward_zero_std": 0.0, "grad_norm": 1.5375040769577026, "kl": 0.02752833254635334, "learning_rate": 3.3567409336361502e-06, "loss": -0.0216, "num_tokens": 2268322.0, "reward": 0.19875000417232513, "reward_std": 0.30625003576278687, "rewards/reward_func/mean": 0.19875000417232513, "rewards/reward_func/std": 0.45642828941345215, "sampling/importance_sampling_ratio/max": 2.6713926792144775, "sampling/importance_sampling_ratio/mean": 1.2524373531341553, "sampling/importance_sampling_ratio/min": 0.6978874206542969, "sampling/sampling_logp_difference/max": 0.4598565101623535, "sampling/sampling_logp_difference/mean": 0.027718737721443176, "step": 408, "step_time": 67.29830334399594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.36724764108657837, "epoch": 0.818, "frac_reward_zero_std": 0.0, "grad_norm": 1.9681098461151123, "kl": 0.05620799958705902, "learning_rate": 3.3491298624602514e-06, "loss": -0.1462, "num_tokens": 2273479.0, "reward": 0.59375, "reward_std": 0.5756310224533081, "rewards/reward_func/mean": 0.59375, "rewards/reward_func/std": 0.5577746033668518, "sampling/importance_sampling_ratio/max": 2.058076858520508, "sampling/importance_sampling_ratio/mean": 1.0275644063949585, "sampling/importance_sampling_ratio/min": 0.5849137306213379, "sampling/sampling_logp_difference/max": 0.6340939998626709, "sampling/sampling_logp_difference/mean": 0.033282943069934845, "step": 409, "step_time": 67.91412261300138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 39.625, "completions/mean_terminated_length": 39.625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3391873240470886, "epoch": 0.82, "frac_reward_zero_std": 0.0, "grad_norm": 2.0897281169891357, "kl": 0.048063769936561584, "learning_rate": 3.3415098843138972e-06, "loss": -0.1435, "num_tokens": 2279337.0, "reward": 0.10750000923871994, "reward_std": 0.2595524489879608, "rewards/reward_func/mean": 0.10750000923871994, "rewards/reward_func/std": 0.35289618372917175, "sampling/importance_sampling_ratio/max": 2.8655083179473877, "sampling/importance_sampling_ratio/mean": 1.0282737016677856, "sampling/importance_sampling_ratio/min": 0.27357611060142517, "sampling/sampling_logp_difference/max": 0.4681780934333801, "sampling/sampling_logp_difference/mean": 0.03042689338326454, "step": 410, "step_time": 83.52543040498858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3415806293487549, "epoch": 0.822, "frac_reward_zero_std": 0.0, "grad_norm": 2.6372363567352295, "kl": 0.050147589296102524, "learning_rate": 3.333881079127052e-06, "loss": -0.3055, "num_tokens": 2284889.0, "reward": 0.21000000834465027, "reward_std": 0.5254905223846436, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.4876474142074585, "sampling/importance_sampling_ratio/max": 2.446664571762085, "sampling/importance_sampling_ratio/mean": 0.9917970895767212, "sampling/importance_sampling_ratio/min": 0.3648597002029419, "sampling/sampling_logp_difference/max": 0.5268797874450684, "sampling/sampling_logp_difference/mean": 0.022872356697916985, "step": 411, "step_time": 73.05385316698812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3670850694179535, "epoch": 0.824, "frac_reward_zero_std": 0.0, "grad_norm": 1.2050609588623047, "kl": 0.01863059774041176, "learning_rate": 3.326243526922272e-06, "loss": 0.1589, "num_tokens": 2290321.0, "reward": 0.19500000774860382, "reward_std": 0.3351808488368988, "rewards/reward_func/mean": 0.19500000774860382, "rewards/reward_func/std": 0.49318209290504456, "sampling/importance_sampling_ratio/max": 1.9619362354278564, "sampling/importance_sampling_ratio/mean": 0.9046612977981567, "sampling/importance_sampling_ratio/min": 0.5072652697563171, "sampling/sampling_logp_difference/max": 0.5272719860076904, "sampling/sampling_logp_difference/mean": 0.027489028871059418, "step": 412, "step_time": 92.1496964310063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3263223469257355, "epoch": 0.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.7847272157669067, "kl": 0.03273576870560646, "learning_rate": 3.3185973078138665e-06, "loss": 0.0545, "num_tokens": 2296019.0, "reward": 0.20125000178813934, "reward_std": 0.5348846316337585, "rewards/reward_func/mean": 0.20125000178813934, "rewards/reward_func/std": 0.4956507384777069, "sampling/importance_sampling_ratio/max": 1.2074002027511597, "sampling/importance_sampling_ratio/mean": 0.7049754858016968, "sampling/importance_sampling_ratio/min": 0.45924341678619385, "sampling/sampling_logp_difference/max": 0.36492061614990234, "sampling/sampling_logp_difference/mean": 0.029689345508813858, "step": 413, "step_time": 74.78980298401439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 43.125, "completions/mean_terminated_length": 43.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.34388232231140137, "epoch": 0.828, "frac_reward_zero_std": 0.0, "grad_norm": 1.1072615385055542, "kl": 0.043772339820861816, "learning_rate": 3.3109425020070564e-06, "loss": 0.2443, "num_tokens": 2301154.0, "reward": 0.15125000476837158, "reward_std": 0.5701001286506653, "rewards/reward_func/mean": 0.15125000476837158, "rewards/reward_func/std": 0.5283244848251343, "sampling/importance_sampling_ratio/max": 1.8563178777694702, "sampling/importance_sampling_ratio/mean": 0.9956398010253906, "sampling/importance_sampling_ratio/min": 0.31948381662368774, "sampling/sampling_logp_difference/max": 0.6346423625946045, "sampling/sampling_logp_difference/mean": 0.0296938456594944, "step": 414, "step_time": 71.1694306099962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.350239098072052, "epoch": 0.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.9788258075714111, "kl": 0.02329590916633606, "learning_rate": 3.3032791897971313e-06, "loss": 0.0095, "num_tokens": 2306595.0, "reward": 0.7325000166893005, "reward_std": 0.31240540742874146, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.4742136597633362, "sampling/importance_sampling_ratio/max": 1.4771513938903809, "sampling/importance_sampling_ratio/mean": 1.0924103260040283, "sampling/importance_sampling_ratio/min": 0.5864495038986206, "sampling/sampling_logp_difference/max": 0.35701167583465576, "sampling/sampling_logp_difference/mean": 0.022829465568065643, "step": 415, "step_time": 51.9381214719906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3615211248397827, "epoch": 0.832, "frac_reward_zero_std": 0.0, "grad_norm": 1.6271722316741943, "kl": 0.04001173749566078, "learning_rate": 3.2956074515686105e-06, "loss": 0.0667, "num_tokens": 2311738.0, "reward": 0.17375001311302185, "reward_std": 0.34344157576560974, "rewards/reward_func/mean": 0.17375001311302185, "rewards/reward_func/std": 0.49100297689437866, "sampling/importance_sampling_ratio/max": 2.3310201168060303, "sampling/importance_sampling_ratio/mean": 1.2795183658599854, "sampling/importance_sampling_ratio/min": 0.5067328214645386, "sampling/sampling_logp_difference/max": 0.495988130569458, "sampling/sampling_logp_difference/mean": 0.029406055808067322, "step": 416, "step_time": 78.19868917198619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.3619380593299866, "epoch": 0.834, "frac_reward_zero_std": 0.0, "grad_norm": 1.2272183895111084, "kl": 0.04641294479370117, "learning_rate": 3.2879273677943972e-06, "loss": -0.0374, "num_tokens": 2317239.0, "reward": 0.4699999988079071, "reward_std": 0.5924452543258667, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.5485044717788696, "sampling/importance_sampling_ratio/max": 1.6035174131393433, "sampling/importance_sampling_ratio/mean": 0.8214474320411682, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5355191230773926, "sampling/sampling_logp_difference/mean": 0.031899720430374146, "step": 417, "step_time": 67.45473215699894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.40914639830589294, "epoch": 0.836, "frac_reward_zero_std": 0.0, "grad_norm": 1.7058494091033936, "kl": 0.04687212407588959, "learning_rate": 3.2802390190349364e-06, "loss": 0.1611, "num_tokens": 2323573.0, "reward": 0.33000001311302185, "reward_std": 0.5646458864212036, "rewards/reward_func/mean": 0.33000001311302185, "rewards/reward_func/std": 0.5414794683456421, "sampling/importance_sampling_ratio/max": 2.929563522338867, "sampling/importance_sampling_ratio/mean": 1.1725656986236572, "sampling/importance_sampling_ratio/min": 0.3646584451198578, "sampling/sampling_logp_difference/max": 0.5263292789459229, "sampling/sampling_logp_difference/mean": 0.03212471306324005, "step": 418, "step_time": 73.99689673900139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.388424813747406, "epoch": 0.838, "frac_reward_zero_std": 0.0, "grad_norm": 1.2566641569137573, "kl": 0.040666110813617706, "learning_rate": 3.272542485937369e-06, "loss": 0.0819, "num_tokens": 2329180.0, "reward": 0.3449999988079071, "reward_std": 0.5655855536460876, "rewards/reward_func/mean": 0.3449999988079071, "rewards/reward_func/std": 0.5425863862037659, "sampling/importance_sampling_ratio/max": 1.893444538116455, "sampling/importance_sampling_ratio/mean": 1.027420997619629, "sampling/importance_sampling_ratio/min": 0.41170260310173035, "sampling/sampling_logp_difference/max": 0.336561918258667, "sampling/sampling_logp_difference/mean": 0.02408537268638611, "step": 419, "step_time": 65.69701232301304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3684334456920624, "epoch": 0.84, "frac_reward_zero_std": 0.0, "grad_norm": 1.4962432384490967, "kl": 0.028718747198581696, "learning_rate": 3.264837849234685e-06, "loss": -0.1286, "num_tokens": 2335827.0, "reward": 0.3362500071525574, "reward_std": 0.5644514560699463, "rewards/reward_func/mean": 0.3362500071525574, "rewards/reward_func/std": 0.5412403345108032, "sampling/importance_sampling_ratio/max": 2.035574197769165, "sampling/importance_sampling_ratio/mean": 1.2553296089172363, "sampling/importance_sampling_ratio/min": 0.5002336502075195, "sampling/sampling_logp_difference/max": 0.2905765771865845, "sampling/sampling_logp_difference/mean": 0.02430140972137451, "step": 420, "step_time": 772.381344155001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 43.375, "completions/mean_terminated_length": 43.375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.39142391085624695, "epoch": 0.842, "frac_reward_zero_std": 0.0, "grad_norm": 1.1592785120010376, "kl": 0.037018656730651855, "learning_rate": 3.257125189744877e-06, "loss": -0.101, "num_tokens": 2341291.0, "reward": 0.45375001430511475, "reward_std": 0.6169389486312866, "rewards/reward_func/mean": 0.45375001430511475, "rewards/reward_func/std": 0.571687638759613, "sampling/importance_sampling_ratio/max": 1.636826992034912, "sampling/importance_sampling_ratio/mean": 0.8795583248138428, "sampling/importance_sampling_ratio/min": 0.4447176456451416, "sampling/sampling_logp_difference/max": 0.5345578193664551, "sampling/sampling_logp_difference/mean": 0.024022206664085388, "step": 421, "step_time": 59.57560216600541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.34517401456832886, "epoch": 0.844, "frac_reward_zero_std": 0.0, "grad_norm": 2.165475606918335, "kl": 0.05213654413819313, "learning_rate": 3.249404588370095e-06, "loss": 0.1109, "num_tokens": 2346050.0, "reward": 0.3412500023841858, "reward_std": 0.5530316829681396, "rewards/reward_func/mean": 0.3412500023841858, "rewards/reward_func/std": 0.532713770866394, "sampling/importance_sampling_ratio/max": 2.421031951904297, "sampling/importance_sampling_ratio/mean": 1.4897425174713135, "sampling/importance_sampling_ratio/min": 0.8438997864723206, "sampling/sampling_logp_difference/max": 0.3565685749053955, "sampling/sampling_logp_difference/mean": 0.028397034853696823, "step": 422, "step_time": 53.3122301310068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3420068621635437, "epoch": 0.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.7624173760414124, "kl": 0.03828991949558258, "learning_rate": 3.2416761260957925e-06, "loss": 0.0449, "num_tokens": 2351785.0, "reward": 0.17999999225139618, "reward_std": 0.5343748331069946, "rewards/reward_func/mean": 0.17999999225139618, "rewards/reward_func/std": 0.49509018659591675, "sampling/importance_sampling_ratio/max": 0.8726930022239685, "sampling/importance_sampling_ratio/mean": 0.6684524416923523, "sampling/importance_sampling_ratio/min": 0.5162980556488037, "sampling/sampling_logp_difference/max": 0.7039575576782227, "sampling/sampling_logp_difference/mean": 0.023529747501015663, "step": 423, "step_time": 71.27505167000345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3805294632911682, "epoch": 0.848, "frac_reward_zero_std": 0.0, "grad_norm": 1.6922258138656616, "kl": 0.04322695732116699, "learning_rate": 3.233939883989882e-06, "loss": 0.1443, "num_tokens": 2357558.0, "reward": 0.3450000286102295, "reward_std": 0.24957968294620514, "rewards/reward_func/mean": 0.3450000286102295, "rewards/reward_func/std": 0.5098739862442017, "sampling/importance_sampling_ratio/max": 2.186922788619995, "sampling/importance_sampling_ratio/mean": 1.163309097290039, "sampling/importance_sampling_ratio/min": 0.4379298686981201, "sampling/sampling_logp_difference/max": 0.7527205944061279, "sampling/sampling_logp_difference/mean": 0.030846048146486282, "step": 424, "step_time": 52.987184192985296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.375, "completions/mean_terminated_length": 50.375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.380068302154541, "epoch": 0.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.7612060308456421, "kl": 0.03679168224334717, "learning_rate": 3.2261959432018834e-06, "loss": -0.0798, "num_tokens": 2362976.0, "reward": 0.32499998807907104, "reward_std": 0.5652101039886475, "rewards/reward_func/mean": 0.32499998807907104, "rewards/reward_func/std": 0.5401322841644287, "sampling/importance_sampling_ratio/max": 1.0401641130447388, "sampling/importance_sampling_ratio/mean": 0.5586026906967163, "sampling/importance_sampling_ratio/min": 0.27103391289711, "sampling/sampling_logp_difference/max": 0.5478124618530273, "sampling/sampling_logp_difference/mean": 0.03913367539644241, "step": 425, "step_time": 76.52260100099375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3354133069515228, "epoch": 0.852, "frac_reward_zero_std": 0.0, "grad_norm": 1.4465264081954956, "kl": 0.02433839999139309, "learning_rate": 3.218444384962071e-06, "loss": -0.404, "num_tokens": 2368735.0, "reward": 0.21250000596046448, "reward_std": 0.32150566577911377, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.48443636298179626, "sampling/importance_sampling_ratio/max": 1.766361951828003, "sampling/importance_sampling_ratio/mean": 0.968987226486206, "sampling/importance_sampling_ratio/min": 0.29835739731788635, "sampling/sampling_logp_difference/max": 0.7882108688354492, "sampling/sampling_logp_difference/mean": 0.029678575694561005, "step": 426, "step_time": 68.1356228920049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 43.125, "completions/mean_terminated_length": 43.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.33666664361953735, "epoch": 0.854, "frac_reward_zero_std": 0.0, "grad_norm": 1.9550222158432007, "kl": 0.0564693845808506, "learning_rate": 3.210685290580622e-06, "loss": -0.0499, "num_tokens": 2373721.0, "reward": 0.3474999964237213, "reward_std": 0.5655620098114014, "rewards/reward_func/mean": 0.3474999964237213, "rewards/reward_func/std": 0.5410770177841187, "sampling/importance_sampling_ratio/max": 1.442548155784607, "sampling/importance_sampling_ratio/mean": 0.9848485589027405, "sampling/importance_sampling_ratio/min": 0.6200289726257324, "sampling/sampling_logp_difference/max": 0.4186820983886719, "sampling/sampling_logp_difference/mean": 0.02865251712501049, "step": 427, "step_time": 46.70685623100144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.36010992527008057, "epoch": 0.856, "frac_reward_zero_std": 0.0, "grad_norm": 1.5365159511566162, "kl": 0.03516993671655655, "learning_rate": 3.2029187414467645e-06, "loss": -0.1816, "num_tokens": 2379614.0, "reward": 0.22500000894069672, "reward_std": 0.3162981867790222, "rewards/reward_func/mean": 0.22500000894069672, "rewards/reward_func/std": 0.47952359914779663, "sampling/importance_sampling_ratio/max": 1.5209850072860718, "sampling/importance_sampling_ratio/mean": 1.071610689163208, "sampling/importance_sampling_ratio/min": 0.45961546897888184, "sampling/sampling_logp_difference/max": 0.6144394874572754, "sampling/sampling_logp_difference/mean": 0.033051151782274246, "step": 428, "step_time": 65.01070052201976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.34195345640182495, "epoch": 0.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.953159749507904, "kl": 0.032611116766929626, "learning_rate": 3.1951448190279256e-06, "loss": 0.0294, "num_tokens": 2385361.0, "reward": 0.09000000357627869, "reward_std": 0.2679736018180847, "rewards/reward_func/mean": 0.09000000357627869, "rewards/reward_func/std": 0.3642212748527527, "sampling/importance_sampling_ratio/max": 1.2745250463485718, "sampling/importance_sampling_ratio/mean": 0.9374400973320007, "sampling/importance_sampling_ratio/min": 0.4317379891872406, "sampling/sampling_logp_difference/max": 0.44930171966552734, "sampling/sampling_logp_difference/mean": 0.02398090809583664, "step": 429, "step_time": 63.217384373012464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3918173611164093, "epoch": 0.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.7345211505889893, "kl": 0.02356931008398533, "learning_rate": 3.1873636048688714e-06, "loss": 0.0721, "num_tokens": 2390785.0, "reward": 0.2150000035762787, "reward_std": 0.5217581987380981, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.4830853343009949, "sampling/importance_sampling_ratio/max": 1.0560283660888672, "sampling/importance_sampling_ratio/mean": 0.7125515937805176, "sampling/importance_sampling_ratio/min": 0.24447228014469147, "sampling/sampling_logp_difference/max": 1.1061149835586548, "sampling/sampling_logp_difference/mean": 0.02791447564959526, "step": 430, "step_time": 69.20054752400029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 49.375, "completions/mean_terminated_length": 49.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3796185851097107, "epoch": 0.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.9570589661598206, "kl": 0.03291895240545273, "learning_rate": 3.1795751805908578e-06, "loss": -0.0766, "num_tokens": 2396141.0, "reward": 0.3125, "reward_std": 0.5881974697113037, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.5682240724563599, "sampling/importance_sampling_ratio/max": 1.2188318967819214, "sampling/importance_sampling_ratio/mean": 0.7393078207969666, "sampling/importance_sampling_ratio/min": 0.3626616597175598, "sampling/sampling_logp_difference/max": 0.3476827144622803, "sampling/sampling_logp_difference/mean": 0.029474619776010513, "step": 431, "step_time": 65.4366222230019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 42.875, "completions/mean_terminated_length": 42.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3385878801345825, "epoch": 0.864, "frac_reward_zero_std": 0.0, "grad_norm": 1.5139399766921997, "kl": 0.041907161474227905, "learning_rate": 3.171779627890769e-06, "loss": 0.1129, "num_tokens": 2400741.0, "reward": 0.1899999976158142, "reward_std": 0.3400847911834717, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.4960990846157074, "sampling/importance_sampling_ratio/max": 1.7639198303222656, "sampling/importance_sampling_ratio/mean": 1.0898196697235107, "sampling/importance_sampling_ratio/min": 0.6894667148590088, "sampling/sampling_logp_difference/max": 0.5719653367996216, "sampling/sampling_logp_difference/mean": 0.028854355216026306, "step": 432, "step_time": 46.04387527299696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.34956616163253784, "epoch": 0.866, "frac_reward_zero_std": 0.0, "grad_norm": 0.9688817262649536, "kl": 0.02931622788310051, "learning_rate": 3.1639770285402632e-06, "loss": 0.1477, "num_tokens": 2405893.0, "reward": 0.04375000298023224, "reward_std": 0.3060930669307709, "rewards/reward_func/mean": 0.04375000298023224, "rewards/reward_func/std": 0.39467665553092957, "sampling/importance_sampling_ratio/max": 1.536496639251709, "sampling/importance_sampling_ratio/mean": 1.0786614418029785, "sampling/importance_sampling_ratio/min": 0.7137445211410522, "sampling/sampling_logp_difference/max": 0.34972822666168213, "sampling/sampling_logp_difference/mean": 0.02250964567065239, "step": 433, "step_time": 69.64978085900657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.36804884672164917, "epoch": 0.868, "frac_reward_zero_std": 0.0, "grad_norm": 1.0465831756591797, "kl": 0.034811001271009445, "learning_rate": 3.1561674643849173e-06, "loss": -0.1412, "num_tokens": 2411564.0, "reward": 0.10000000149011612, "reward_std": 0.26890355348587036, "rewards/reward_func/mean": 0.10000000149011612, "rewards/reward_func/std": 0.3607531785964966, "sampling/importance_sampling_ratio/max": 2.2357981204986572, "sampling/importance_sampling_ratio/mean": 0.8856201767921448, "sampling/importance_sampling_ratio/min": 0.4398100972175598, "sampling/sampling_logp_difference/max": 0.648827075958252, "sampling/sampling_logp_difference/mean": 0.02694147266447544, "step": 434, "step_time": 78.29400004900526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3648771643638611, "epoch": 0.87, "frac_reward_zero_std": 0.0, "grad_norm": 1.5500693321228027, "kl": 0.049925297498703, "learning_rate": 3.148351017343363e-06, "loss": 0.0987, "num_tokens": 2418201.0, "reward": 0.21875, "reward_std": 0.5047336220741272, "rewards/reward_func/mean": 0.21875, "rewards/reward_func/std": 0.46759071946144104, "sampling/importance_sampling_ratio/max": 2.2972049713134766, "sampling/importance_sampling_ratio/mean": 1.2422068119049072, "sampling/importance_sampling_ratio/min": 0.5480000376701355, "sampling/sampling_logp_difference/max": 0.49748849868774414, "sampling/sampling_logp_difference/mean": 0.03015657514333725, "step": 435, "step_time": 80.38836584499222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.875, "completions/mean_terminated_length": 45.875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3652074337005615, "epoch": 0.872, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921752214431763, "kl": 0.0575464591383934, "learning_rate": 3.1405277694064306e-06, "loss": -0.2442, "num_tokens": 2423760.0, "reward": 0.20124998688697815, "reward_std": 0.32401105761528015, "rewards/reward_func/mean": 0.20124998688697815, "rewards/reward_func/std": 0.49380266666412354, "sampling/importance_sampling_ratio/max": 1.8371641635894775, "sampling/importance_sampling_ratio/mean": 1.028379201889038, "sampling/importance_sampling_ratio/min": 0.34969013929367065, "sampling/sampling_logp_difference/max": 0.47838956117630005, "sampling/sampling_logp_difference/mean": 0.028822563588619232, "step": 436, "step_time": 111.30371044500498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3787916898727417, "epoch": 0.874, "frac_reward_zero_std": 0.0, "grad_norm": 1.373725414276123, "kl": 0.029538137838244438, "learning_rate": 3.1326978026362907e-06, "loss": -0.1174, "num_tokens": 2429732.0, "reward": 0.1912499964237213, "reward_std": 0.32099446654319763, "rewards/reward_func/mean": 0.1912499964237213, "rewards/reward_func/std": 0.4616256058216095, "sampling/importance_sampling_ratio/max": 1.5630745887756348, "sampling/importance_sampling_ratio/mean": 1.042021632194519, "sampling/importance_sampling_ratio/min": 0.4928293526172638, "sampling/sampling_logp_difference/max": 0.35456085205078125, "sampling/sampling_logp_difference/mean": 0.02736075408756733, "step": 437, "step_time": 104.65821277699433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.35759854316711426, "epoch": 0.876, "frac_reward_zero_std": 0.0, "grad_norm": 1.3202944993972778, "kl": 0.040575023740530014, "learning_rate": 3.1248611991655885e-06, "loss": 0.0142, "num_tokens": 2435583.0, "reward": 0.2237500101327896, "reward_std": 0.3150855600833893, "rewards/reward_func/mean": 0.2237500101327896, "rewards/reward_func/std": 0.4777906835079193, "sampling/importance_sampling_ratio/max": 1.4550713300704956, "sampling/importance_sampling_ratio/mean": 0.7871130108833313, "sampling/importance_sampling_ratio/min": 0.40031906962394714, "sampling/sampling_logp_difference/max": 0.5306928157806396, "sampling/sampling_logp_difference/mean": 0.028820747509598732, "step": 438, "step_time": 81.86951732399757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3563615679740906, "epoch": 0.878, "frac_reward_zero_std": 0.0, "grad_norm": 1.279760718345642, "kl": 0.03050798550248146, "learning_rate": 3.1170180411965854e-06, "loss": -0.1991, "num_tokens": 2442392.0, "reward": 0.36250001192092896, "reward_std": 0.5422559976577759, "rewards/reward_func/mean": 0.36250001192092896, "rewards/reward_func/std": 0.519855797290802, "sampling/importance_sampling_ratio/max": 1.3513309955596924, "sampling/importance_sampling_ratio/mean": 0.7875853776931763, "sampling/importance_sampling_ratio/min": 0.474069207906723, "sampling/sampling_logp_difference/max": 0.5970335006713867, "sampling/sampling_logp_difference/mean": 0.027284270152449608, "step": 439, "step_time": 78.53958937400603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.36696097254753113, "epoch": 0.88, "frac_reward_zero_std": 0.0, "grad_norm": 1.3010104894638062, "kl": 0.10381826758384705, "learning_rate": 3.109168411000299e-06, "loss": 0.0224, "num_tokens": 2447245.0, "reward": 0.4625000059604645, "reward_std": 0.5960428714752197, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.5521063804626465, "sampling/importance_sampling_ratio/max": 1.707277536392212, "sampling/importance_sampling_ratio/mean": 0.8685052394866943, "sampling/importance_sampling_ratio/min": 0.19099442660808563, "sampling/sampling_logp_difference/max": 1.2762131690979004, "sampling/sampling_logp_difference/mean": 0.029757626354694366, "step": 440, "step_time": 63.11404897898319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.125, "completions/mean_terminated_length": 46.125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3573892414569855, "epoch": 0.882, "frac_reward_zero_std": 0.0, "grad_norm": 1.6774803400039673, "kl": 0.05356031656265259, "learning_rate": 3.1013123909156347e-06, "loss": -0.0621, "num_tokens": 2452150.0, "reward": 0.3125, "reward_std": 0.5504498481750488, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.5366496443748474, "sampling/importance_sampling_ratio/max": 2.0549330711364746, "sampling/importance_sampling_ratio/mean": 1.1193873882293701, "sampling/importance_sampling_ratio/min": 0.4323776066303253, "sampling/sampling_logp_difference/max": 0.3989245891571045, "sampling/sampling_logp_difference/mean": 0.025564704090356827, "step": 441, "step_time": 76.97689333499875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.30483388900756836, "epoch": 0.884, "frac_reward_zero_std": 0.0, "grad_norm": 1.4331544637680054, "kl": 0.02398044988512993, "learning_rate": 3.093450063348525e-06, "loss": 0.364, "num_tokens": 2457723.0, "reward": 0.07000000029802322, "reward_std": 0.2906396687030792, "rewards/reward_func/mean": 0.07000000029802322, "rewards/reward_func/std": 0.37815341353416443, "sampling/importance_sampling_ratio/max": 2.091522216796875, "sampling/importance_sampling_ratio/mean": 1.1620619297027588, "sampling/importance_sampling_ratio/min": 0.6012184023857117, "sampling/sampling_logp_difference/max": 0.30550384521484375, "sampling/sampling_logp_difference/mean": 0.022237438708543777, "step": 442, "step_time": 95.15003140000044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.34086257219314575, "epoch": 0.886, "frac_reward_zero_std": 0.0, "grad_norm": 1.1229441165924072, "kl": 0.01796545460820198, "learning_rate": 3.085581510771067e-06, "loss": -0.0669, "num_tokens": 2462560.0, "reward": 0.3512499928474426, "reward_std": 0.5547357797622681, "rewards/reward_func/mean": 0.3512499928474426, "rewards/reward_func/std": 0.5352286100387573, "sampling/importance_sampling_ratio/max": 1.212695837020874, "sampling/importance_sampling_ratio/mean": 0.912885844707489, "sampling/importance_sampling_ratio/min": 0.4976806342601776, "sampling/sampling_logp_difference/max": 0.33936166763305664, "sampling/sampling_logp_difference/mean": 0.02151985839009285, "step": 443, "step_time": 56.16717357101152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.34066683053970337, "epoch": 0.888, "frac_reward_zero_std": 0.0, "grad_norm": 1.1225097179412842, "kl": 0.029933886602520943, "learning_rate": 3.0777068157206535e-06, "loss": 0.0719, "num_tokens": 2468388.0, "reward": 0.1899999976158142, "reward_std": 0.5411940813064575, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.5013980269432068, "sampling/importance_sampling_ratio/max": 1.6062737703323364, "sampling/importance_sampling_ratio/mean": 0.7600141167640686, "sampling/importance_sampling_ratio/min": 0.24852630496025085, "sampling/sampling_logp_difference/max": 0.6514277458190918, "sampling/sampling_logp_difference/mean": 0.025996115058660507, "step": 444, "step_time": 86.21614760000375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3731077015399933, "epoch": 0.89, "frac_reward_zero_std": 0.0, "grad_norm": 2.1878068447113037, "kl": 0.029690194875001907, "learning_rate": 3.0698260607991094e-06, "loss": -0.1014, "num_tokens": 2473364.0, "reward": 0.21875, "reward_std": 0.5235260725021362, "rewards/reward_func/mean": 0.21875, "rewards/reward_func/std": 0.4851638376712799, "sampling/importance_sampling_ratio/max": 1.600021481513977, "sampling/importance_sampling_ratio/mean": 0.9576125144958496, "sampling/importance_sampling_ratio/min": 0.5527809858322144, "sampling/sampling_logp_difference/max": 0.35846877098083496, "sampling/sampling_logp_difference/mean": 0.02683459408581257, "step": 445, "step_time": 66.11640017299214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.36752647161483765, "epoch": 0.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.9450146555900574, "kl": 0.0277615524828434, "learning_rate": 3.061939328671824e-06, "loss": 0.1718, "num_tokens": 2478775.0, "reward": 0.3125, "reward_std": 0.5557467341423035, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.5402578711509705, "sampling/importance_sampling_ratio/max": 1.8847589492797852, "sampling/importance_sampling_ratio/mean": 0.9969915747642517, "sampling/importance_sampling_ratio/min": 0.4349041283130646, "sampling/sampling_logp_difference/max": 0.40544378757476807, "sampling/sampling_logp_difference/mean": 0.02645990625023842, "step": 446, "step_time": 93.85621070399066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 44.375, "completions/mean_terminated_length": 44.375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.35863834619522095, "epoch": 0.894, "frac_reward_zero_std": 0.0, "grad_norm": 1.6648696660995483, "kl": 0.03483325242996216, "learning_rate": 3.054046702066886e-06, "loss": 0.1038, "num_tokens": 2484436.0, "reward": 0.5862500071525574, "reward_std": 0.5568501353263855, "rewards/reward_func/mean": 0.5862500071525574, "rewards/reward_func/std": 0.5336917042732239, "sampling/importance_sampling_ratio/max": 2.2653186321258545, "sampling/importance_sampling_ratio/mean": 1.2341718673706055, "sampling/importance_sampling_ratio/min": 0.42728522419929504, "sampling/sampling_logp_difference/max": 0.7648518085479736, "sampling/sampling_logp_difference/mean": 0.025960015133023262, "step": 447, "step_time": 76.71857017299044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3564414083957672, "epoch": 0.896, "frac_reward_zero_std": 0.0, "grad_norm": 1.1071441173553467, "kl": 0.02148180454969406, "learning_rate": 3.0461482637742133e-06, "loss": 0.0403, "num_tokens": 2490437.0, "reward": 0.32875001430511475, "reward_std": 0.569521427154541, "rewards/reward_func/mean": 0.32875001430511475, "rewards/reward_func/std": 0.547577440738678, "sampling/importance_sampling_ratio/max": 1.057119607925415, "sampling/importance_sampling_ratio/mean": 0.8925005197525024, "sampling/importance_sampling_ratio/min": 0.7844187021255493, "sampling/sampling_logp_difference/max": 0.3507990837097168, "sampling/sampling_logp_difference/mean": 0.026810673996806145, "step": 448, "step_time": 82.088118226995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3908958435058594, "epoch": 0.898, "frac_reward_zero_std": 0.0, "grad_norm": 1.2796645164489746, "kl": 0.027592984959483147, "learning_rate": 3.0382440966446876e-06, "loss": -0.1308, "num_tokens": 2496183.0, "reward": 0.07874999940395355, "reward_std": 0.2891866862773895, "rewards/reward_func/mean": 0.07874999940395355, "rewards/reward_func/std": 0.37215349078178406, "sampling/importance_sampling_ratio/max": 1.3071595430374146, "sampling/importance_sampling_ratio/mean": 0.8888345956802368, "sampling/importance_sampling_ratio/min": 0.33864834904670715, "sampling/sampling_logp_difference/max": 0.3511269688606262, "sampling/sampling_logp_difference/mean": 0.03274238109588623, "step": 449, "step_time": 70.07518242698279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 58.875, "completions/mean_terminated_length": 58.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.35438233613967896, "epoch": 0.9, "frac_reward_zero_std": 0.0, "grad_norm": 1.3290492296218872, "kl": 0.013173183426260948, "learning_rate": 3.0303342835892804e-06, "loss": -0.041, "num_tokens": 2502767.0, "reward": 0.23125000298023224, "reward_std": 0.5127817392349243, "rewards/reward_func/mean": 0.23125000298023224, "rewards/reward_func/std": 0.4747461676597595, "sampling/importance_sampling_ratio/max": 2.287126302719116, "sampling/importance_sampling_ratio/mean": 1.1372334957122803, "sampling/importance_sampling_ratio/min": 0.29354920983314514, "sampling/sampling_logp_difference/max": 0.5065096616744995, "sampling/sampling_logp_difference/mean": 0.025254379957914352, "step": 450, "step_time": 73.5957540590025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 43.125, "completions/mean_terminated_length": 43.125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.36208608746528625, "epoch": 0.902, "frac_reward_zero_std": 0.0, "grad_norm": 1.2244484424591064, "kl": 0.024516358971595764, "learning_rate": 3.0224189075781886e-06, "loss": -0.0166, "num_tokens": 2509154.0, "reward": 0.20374999940395355, "reward_std": 0.524694561958313, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.48582589626312256, "sampling/importance_sampling_ratio/max": 1.417374610900879, "sampling/importance_sampling_ratio/mean": 0.9122079610824585, "sampling/importance_sampling_ratio/min": 0.3931795656681061, "sampling/sampling_logp_difference/max": 0.4949922561645508, "sampling/sampling_logp_difference/mean": 0.0227043554186821, "step": 451, "step_time": 78.99534437101102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 42.375, "completions/mean_terminated_length": 42.375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.39959007501602173, "epoch": 0.904, "frac_reward_zero_std": 0.0, "grad_norm": 1.0420317649841309, "kl": 0.019628848880529404, "learning_rate": 3.014498051639959e-06, "loss": 0.1187, "num_tokens": 2514772.0, "reward": -0.05375000089406967, "reward_std": 0.0541689358651638, "rewards/reward_func/mean": -0.05375000089406967, "rewards/reward_func/std": 0.05705573782324791, "sampling/importance_sampling_ratio/max": 1.0508769750595093, "sampling/importance_sampling_ratio/mean": 0.7083895802497864, "sampling/importance_sampling_ratio/min": 0.29281339049339294, "sampling/sampling_logp_difference/max": 0.9839637279510498, "sampling/sampling_logp_difference/mean": 0.03381787985563278, "step": 452, "step_time": 81.53623366498505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.367666631937027, "epoch": 0.906, "frac_reward_zero_std": 0.0, "grad_norm": 1.8917317390441895, "kl": 0.022898491472005844, "learning_rate": 3.006571798860626e-06, "loss": 0.0868, "num_tokens": 2519920.0, "reward": 0.45625001192092896, "reward_std": 0.6168291568756104, "rewards/reward_func/mean": 0.45625001192092896, "rewards/reward_func/std": 0.5718875527381897, "sampling/importance_sampling_ratio/max": 2.3027987480163574, "sampling/importance_sampling_ratio/mean": 1.5321813821792603, "sampling/importance_sampling_ratio/min": 0.91518634557724, "sampling/sampling_logp_difference/max": 0.6675479412078857, "sampling/sampling_logp_difference/mean": 0.02647707611322403, "step": 453, "step_time": 71.51700325298589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3597128987312317, "epoch": 0.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.8695248365402222, "kl": 0.044961053878068924, "learning_rate": 2.9986402323828274e-06, "loss": 0.0217, "num_tokens": 2525228.0, "reward": 0.32124999165534973, "reward_std": 0.551304042339325, "rewards/reward_func/mean": 0.32124999165534973, "rewards/reward_func/std": 0.5287569761276245, "sampling/importance_sampling_ratio/max": 1.051468014717102, "sampling/importance_sampling_ratio/mean": 0.6988952159881592, "sampling/importance_sampling_ratio/min": 0.27730298042297363, "sampling/sampling_logp_difference/max": 0.5294761657714844, "sampling/sampling_logp_difference/mean": 0.03408505767583847, "step": 454, "step_time": 66.79093995000585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.40276795625686646, "epoch": 0.91, "frac_reward_zero_std": 0.0, "grad_norm": 1.8840203285217285, "kl": 0.020732712000608444, "learning_rate": 2.9907034354049443e-06, "loss": -0.206, "num_tokens": 2530621.0, "reward": 0.22499999403953552, "reward_std": 0.5174823999404907, "rewards/reward_func/mean": 0.22499999403953552, "rewards/reward_func/std": 0.4791063070297241, "sampling/importance_sampling_ratio/max": 1.370976448059082, "sampling/importance_sampling_ratio/mean": 0.9440828561782837, "sampling/importance_sampling_ratio/min": 0.7049600481987, "sampling/sampling_logp_difference/max": 0.33017855882644653, "sampling/sampling_logp_difference/mean": 0.02502366527915001, "step": 455, "step_time": 65.70268553399364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3440622091293335, "epoch": 0.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.9514747262001038, "kl": 0.02538667805492878, "learning_rate": 2.9827614911802205e-06, "loss": -0.2967, "num_tokens": 2536636.0, "reward": 0.5924999713897705, "reward_std": 0.5447898507118225, "rewards/reward_func/mean": 0.5924999713897705, "rewards/reward_func/std": 0.5242614150047302, "sampling/importance_sampling_ratio/max": 1.4159477949142456, "sampling/importance_sampling_ratio/mean": 0.8564717769622803, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6149642467498779, "sampling/sampling_logp_difference/mean": 0.02475292794406414, "step": 456, "step_time": 60.81095889999415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.36642664670944214, "epoch": 0.914, "frac_reward_zero_std": 0.0, "grad_norm": 1.3120555877685547, "kl": 0.024605643004179, "learning_rate": 2.9748144830158925e-06, "loss": 0.1126, "num_tokens": 2542012.0, "reward": 0.4650000035762787, "reward_std": 0.6151281595230103, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.5700375437736511, "sampling/importance_sampling_ratio/max": 1.2174410820007324, "sampling/importance_sampling_ratio/mean": 0.8987510204315186, "sampling/importance_sampling_ratio/min": 0.4412446916103363, "sampling/sampling_logp_difference/max": 0.48370981216430664, "sampling/sampling_logp_difference/mean": 0.02678913250565529, "step": 457, "step_time": 51.764286594989244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 45.375, "completions/mean_terminated_length": 45.375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.39495640993118286, "epoch": 0.916, "frac_reward_zero_std": 0.0, "grad_norm": 1.3088325262069702, "kl": 0.03151445835828781, "learning_rate": 2.966862494272316e-06, "loss": -0.0101, "num_tokens": 2547545.0, "reward": 0.3399999737739563, "reward_std": 0.554172158241272, "rewards/reward_func/mean": 0.3399999737739563, "rewards/reward_func/std": 0.530336856842041, "sampling/importance_sampling_ratio/max": 1.727379560470581, "sampling/importance_sampling_ratio/mean": 0.9480923414230347, "sampling/importance_sampling_ratio/min": 0.2979666590690613, "sampling/sampling_logp_difference/max": 0.5283234119415283, "sampling/sampling_logp_difference/mean": 0.02785215526819229, "step": 458, "step_time": 67.89937086799182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.33337390422821045, "epoch": 0.918, "frac_reward_zero_std": 0.0, "grad_norm": 1.1922571659088135, "kl": 0.03265373408794403, "learning_rate": 2.9589056083620902e-06, "loss": -0.1628, "num_tokens": 2552724.0, "reward": 0.4387500286102295, "reward_std": 0.6136727333068848, "rewards/reward_func/mean": 0.4387500286102295, "rewards/reward_func/std": 0.5688695311546326, "sampling/importance_sampling_ratio/max": 1.5377517938613892, "sampling/importance_sampling_ratio/mean": 0.7701914310455322, "sampling/importance_sampling_ratio/min": 0.3763391673564911, "sampling/sampling_logp_difference/max": 0.8605606555938721, "sampling/sampling_logp_difference/mean": 0.026637043803930283, "step": 459, "step_time": 63.45414676400833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3421512842178345, "epoch": 0.92, "frac_reward_zero_std": 0.0, "grad_norm": 1.6250663995742798, "kl": 0.02636205032467842, "learning_rate": 2.9509439087491837e-06, "loss": 0.0988, "num_tokens": 2558037.0, "reward": -0.07374999672174454, "reward_std": 0.04552318900823593, "rewards/reward_func/mean": -0.07374999672174454, "rewards/reward_func/std": 0.050691645592451096, "sampling/importance_sampling_ratio/max": 2.0454702377319336, "sampling/importance_sampling_ratio/mean": 1.0612456798553467, "sampling/importance_sampling_ratio/min": 0.6895912885665894, "sampling/sampling_logp_difference/max": 0.5679692029953003, "sampling/sampling_logp_difference/mean": 0.023974724113941193, "step": 460, "step_time": 86.26930091198301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.36159491539001465, "epoch": 0.922, "frac_reward_zero_std": 0.0, "grad_norm": 1.6231329441070557, "kl": 0.03310780972242355, "learning_rate": 2.9429774789480576e-06, "loss": 0.0836, "num_tokens": 2562984.0, "reward": 0.33000001311302185, "reward_std": 0.5563285946846008, "rewards/reward_func/mean": 0.33000001311302185, "rewards/reward_func/std": 0.5400264263153076, "sampling/importance_sampling_ratio/max": 1.5695173740386963, "sampling/importance_sampling_ratio/mean": 1.1787632703781128, "sampling/importance_sampling_ratio/min": 0.7933380007743835, "sampling/sampling_logp_difference/max": 0.5508012771606445, "sampling/sampling_logp_difference/mean": 0.028630632907152176, "step": 461, "step_time": 44.234594836016186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.36595630645751953, "epoch": 0.924, "frac_reward_zero_std": 0.0, "grad_norm": 1.5493158102035522, "kl": 0.018796313554048538, "learning_rate": 2.93500640252279e-06, "loss": -0.1754, "num_tokens": 2568343.0, "reward": 0.4675000309944153, "reward_std": 0.6093506813049316, "rewards/reward_func/mean": 0.4675000309944153, "rewards/reward_func/std": 0.564212441444397, "sampling/importance_sampling_ratio/max": 1.9704557657241821, "sampling/importance_sampling_ratio/mean": 1.0828232765197754, "sampling/importance_sampling_ratio/min": 0.3847387135028839, "sampling/sampling_logp_difference/max": 0.30640411376953125, "sampling/sampling_logp_difference/mean": 0.025095216929912567, "step": 462, "step_time": 58.50289387899102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.375, "completions/mean_terminated_length": 49.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3116268217563629, "epoch": 0.926, "frac_reward_zero_std": 0.0, "grad_norm": 1.1552741527557373, "kl": 0.02773866429924965, "learning_rate": 2.927030763086201e-06, "loss": -0.3653, "num_tokens": 2573304.0, "reward": 0.6000000238418579, "reward_std": 0.5449049472808838, "rewards/reward_func/mean": 0.6000000238418579, "rewards/reward_func/std": 0.5248673558235168, "sampling/importance_sampling_ratio/max": 1.798938512802124, "sampling/importance_sampling_ratio/mean": 1.0102436542510986, "sampling/importance_sampling_ratio/min": 0.319669634103775, "sampling/sampling_logp_difference/max": 0.40699052810668945, "sampling/sampling_logp_difference/mean": 0.027081940323114395, "step": 463, "step_time": 57.92807800701121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3475750684738159, "epoch": 0.928, "frac_reward_zero_std": 0.0, "grad_norm": 1.0931487083435059, "kl": 0.01362108439207077, "learning_rate": 2.9190506442989753e-06, "loss": 0.0808, "num_tokens": 2578554.0, "reward": 0.08124999701976776, "reward_std": 0.2719267010688782, "rewards/reward_func/mean": 0.08124999701976776, "rewards/reward_func/std": 0.36041396856307983, "sampling/importance_sampling_ratio/max": 1.1853911876678467, "sampling/importance_sampling_ratio/mean": 0.9154686331748962, "sampling/importance_sampling_ratio/min": 0.48412805795669556, "sampling/sampling_logp_difference/max": 0.6382970809936523, "sampling/sampling_logp_difference/mean": 0.022256169468164444, "step": 464, "step_time": 61.422123302007094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 42.125, "completions/mean_terminated_length": 42.125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.307975709438324, "epoch": 0.93, "frac_reward_zero_std": 0.0, "grad_norm": 1.3115994930267334, "kl": 0.033101074397563934, "learning_rate": 2.9110661298687824e-06, "loss": -0.0603, "num_tokens": 2583778.0, "reward": 0.45875000953674316, "reward_std": 0.6054055690765381, "rewards/reward_func/mean": 0.45875000953674316, "rewards/reward_func/std": 0.5611579418182373, "sampling/importance_sampling_ratio/max": 1.275829792022705, "sampling/importance_sampling_ratio/mean": 0.8946892619132996, "sampling/importance_sampling_ratio/min": 0.5616273283958435, "sampling/sampling_logp_difference/max": 0.654704213142395, "sampling/sampling_logp_difference/mean": 0.02203410118818283, "step": 465, "step_time": 61.31393297199975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.35501527786254883, "epoch": 0.932, "frac_reward_zero_std": 0.0, "grad_norm": 2.040752649307251, "kl": 0.03077756240963936, "learning_rate": 2.9030773035493997e-06, "loss": 0.2758, "num_tokens": 2589204.0, "reward": 0.3149999976158142, "reward_std": 0.5430054664611816, "rewards/reward_func/mean": 0.3149999976158142, "rewards/reward_func/std": 0.5299056172370911, "sampling/importance_sampling_ratio/max": 2.528179883956909, "sampling/importance_sampling_ratio/mean": 1.2432548999786377, "sampling/importance_sampling_ratio/min": 0.5364408493041992, "sampling/sampling_logp_difference/max": 0.34423089027404785, "sampling/sampling_logp_difference/mean": 0.027653541415929794, "step": 466, "step_time": 61.11274787300499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3300337493419647, "epoch": 0.934, "frac_reward_zero_std": 0.0, "grad_norm": 1.0090454816818237, "kl": 0.02721918746829033, "learning_rate": 2.8950842491398358e-06, "loss": -0.0327, "num_tokens": 2595236.0, "reward": 0.22374999523162842, "reward_std": 0.5187286734580994, "rewards/reward_func/mean": 0.22374999523162842, "rewards/reward_func/std": 0.4808307886123657, "sampling/importance_sampling_ratio/max": 1.447536587715149, "sampling/importance_sampling_ratio/mean": 0.9794137477874756, "sampling/importance_sampling_ratio/min": 0.46334025263786316, "sampling/sampling_logp_difference/max": 0.3176230192184448, "sampling/sampling_logp_difference/mean": 0.022374983876943588, "step": 467, "step_time": 76.51882786100032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.34252166748046875, "epoch": 0.936, "frac_reward_zero_std": 0.0, "grad_norm": 0.9320221543312073, "kl": 0.018017075955867767, "learning_rate": 2.8870870504834497e-06, "loss": -0.1157, "num_tokens": 2600730.0, "reward": 0.07999999821186066, "reward_std": 0.2839134931564331, "rewards/reward_func/mean": 0.07999999821186066, "rewards/reward_func/std": 0.3744710385799408, "sampling/importance_sampling_ratio/max": 2.2560055255889893, "sampling/importance_sampling_ratio/mean": 0.970880925655365, "sampling/importance_sampling_ratio/min": 0.39924535155296326, "sampling/sampling_logp_difference/max": 0.4781172275543213, "sampling/sampling_logp_difference/mean": 0.025780895724892616, "step": 468, "step_time": 65.44530803800444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3617730140686035, "epoch": 0.938, "frac_reward_zero_std": 0.0, "grad_norm": 0.8858951926231384, "kl": 0.030386239290237427, "learning_rate": 2.87908579146707e-06, "loss": 0.1291, "num_tokens": 2606113.0, "reward": 0.21875, "reward_std": 0.5210141539573669, "rewards/reward_func/mean": 0.21875, "rewards/reward_func/std": 0.48238804936408997, "sampling/importance_sampling_ratio/max": 1.4669780731201172, "sampling/importance_sampling_ratio/mean": 0.900518536567688, "sampling/importance_sampling_ratio/min": 0.4941990077495575, "sampling/sampling_logp_difference/max": 0.39029061794281006, "sampling/sampling_logp_difference/mean": 0.022767363116145134, "step": 469, "step_time": 64.19338588201208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.34387028217315674, "epoch": 0.94, "frac_reward_zero_std": 0.0, "grad_norm": 1.0154716968536377, "kl": 0.02230965718626976, "learning_rate": 2.8710805560201184e-06, "loss": -0.1484, "num_tokens": 2611738.0, "reward": 0.6075000166893005, "reward_std": 0.5653331279754639, "rewards/reward_func/mean": 0.6075000166893005, "rewards/reward_func/std": 0.5418421030044556, "sampling/importance_sampling_ratio/max": 1.161827564239502, "sampling/importance_sampling_ratio/mean": 0.8859966397285461, "sampling/importance_sampling_ratio/min": 0.28873908519744873, "sampling/sampling_logp_difference/max": 0.34857702255249023, "sampling/sampling_logp_difference/mean": 0.025210872292518616, "step": 470, "step_time": 60.65404006501194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3458770513534546, "epoch": 0.942, "frac_reward_zero_std": 0.0, "grad_norm": 1.0485503673553467, "kl": 0.01893254555761814, "learning_rate": 2.8630714281137263e-06, "loss": 0.3028, "num_tokens": 2617938.0, "reward": 0.18000000715255737, "reward_std": 0.3153059482574463, "rewards/reward_func/mean": 0.18000000715255737, "rewards/reward_func/std": 0.49796730279922485, "sampling/importance_sampling_ratio/max": 1.7840207815170288, "sampling/importance_sampling_ratio/mean": 1.1260120868682861, "sampling/importance_sampling_ratio/min": 0.7038984298706055, "sampling/sampling_logp_difference/max": 0.36597251892089844, "sampling/sampling_logp_difference/mean": 0.02206684835255146, "step": 471, "step_time": 64.57444148999639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.4065213203430176, "epoch": 0.944, "frac_reward_zero_std": 0.0, "grad_norm": 0.7479361891746521, "kl": 0.022909866645932198, "learning_rate": 2.8550584917598558e-06, "loss": 0.0759, "num_tokens": 2624135.0, "reward": 0.07874999940395355, "reward_std": 0.26868927478790283, "rewards/reward_func/mean": 0.07874999940395355, "rewards/reward_func/std": 0.36490458250045776, "sampling/importance_sampling_ratio/max": 1.3143762350082397, "sampling/importance_sampling_ratio/mean": 0.7255112528800964, "sampling/importance_sampling_ratio/min": 0.27511295676231384, "sampling/sampling_logp_difference/max": 0.46601831912994385, "sampling/sampling_logp_difference/mean": 0.031219232827425003, "step": 472, "step_time": 83.27192145001027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3374762535095215, "epoch": 0.946, "frac_reward_zero_std": 0.0, "grad_norm": 1.2723208665847778, "kl": 0.022902309894561768, "learning_rate": 2.8470418310104175e-06, "loss": -0.2609, "num_tokens": 2629832.0, "reward": 0.0625, "reward_std": 0.24701336026191711, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.31998884677886963, "sampling/importance_sampling_ratio/max": 2.4061381816864014, "sampling/importance_sampling_ratio/mean": 1.0160009860992432, "sampling/importance_sampling_ratio/min": 0.5389451384544373, "sampling/sampling_logp_difference/max": 0.5744847059249878, "sampling/sampling_logp_difference/mean": 0.028143033385276794, "step": 473, "step_time": 73.6639021729934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3563547730445862, "epoch": 0.948, "frac_reward_zero_std": 0.0, "grad_norm": 0.9397950172424316, "kl": 0.022011034190654755, "learning_rate": 2.839021529956388e-06, "loss": 0.0807, "num_tokens": 2635568.0, "reward": 0.21124999225139618, "reward_std": 0.527900218963623, "rewards/reward_func/mean": 0.21124999225139618, "rewards/reward_func/std": 0.48894748091697693, "sampling/importance_sampling_ratio/max": 1.040662169456482, "sampling/importance_sampling_ratio/mean": 0.7214508652687073, "sampling/importance_sampling_ratio/min": 0.3372233211994171, "sampling/sampling_logp_difference/max": 0.45850083231925964, "sampling/sampling_logp_difference/mean": 0.02489865943789482, "step": 474, "step_time": 75.28077130601741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.35087794065475464, "epoch": 0.95, "frac_reward_zero_std": 0.0, "grad_norm": 1.5176308155059814, "kl": 0.017775265499949455, "learning_rate": 2.8309976727269335e-06, "loss": 0.2178, "num_tokens": 2641222.0, "reward": 0.06499999761581421, "reward_std": 0.2761574685573578, "rewards/reward_func/mean": 0.06499999761581421, "rewards/reward_func/std": 0.37939804792404175, "sampling/importance_sampling_ratio/max": 1.7761144638061523, "sampling/importance_sampling_ratio/mean": 0.925238847732544, "sampling/importance_sampling_ratio/min": 0.3248174488544464, "sampling/sampling_logp_difference/max": 0.6076414585113525, "sampling/sampling_logp_difference/mean": 0.02674829587340355, "step": 475, "step_time": 72.01986586101702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 43.625, "completions/mean_terminated_length": 43.625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.2991679906845093, "epoch": 0.952, "frac_reward_zero_std": 0.0, "grad_norm": 1.390182614326477, "kl": 0.020573535934090614, "learning_rate": 2.8229703434885165e-06, "loss": -0.0348, "num_tokens": 2646859.0, "reward": 0.33250001072883606, "reward_std": 0.5396865606307983, "rewards/reward_func/mean": 0.33250001072883606, "rewards/reward_func/std": 0.517707884311676, "sampling/importance_sampling_ratio/max": 1.602697730064392, "sampling/importance_sampling_ratio/mean": 0.9728833436965942, "sampling/importance_sampling_ratio/min": 0.48104777932167053, "sampling/sampling_logp_difference/max": 0.6165962219238281, "sampling/sampling_logp_difference/mean": 0.023785192519426346, "step": 476, "step_time": 77.24283880199073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.3583204448223114, "epoch": 0.954, "frac_reward_zero_std": 0.0, "grad_norm": 1.142444133758545, "kl": 0.015169752761721611, "learning_rate": 2.814939626444023e-06, "loss": -0.0124, "num_tokens": 2652207.0, "reward": 0.21375000476837158, "reward_std": 0.5112752914428711, "rewards/reward_func/mean": 0.21375000476837158, "rewards/reward_func/std": 0.473495751619339, "sampling/importance_sampling_ratio/max": 1.9258140325546265, "sampling/importance_sampling_ratio/mean": 1.1217129230499268, "sampling/importance_sampling_ratio/min": 0.8287367820739746, "sampling/sampling_logp_difference/max": 0.3338189125061035, "sampling/sampling_logp_difference/mean": 0.022958340123295784, "step": 477, "step_time": 61.92764704397996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.35365796089172363, "epoch": 0.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.9897144436836243, "kl": 0.01738031394779682, "learning_rate": 2.8069056058318754e-06, "loss": 0.0097, "num_tokens": 2658227.0, "reward": 0.20500001311302185, "reward_std": 0.5114267468452454, "rewards/reward_func/mean": 0.20500001311302185, "rewards/reward_func/std": 0.47416090965270996, "sampling/importance_sampling_ratio/max": 2.027554750442505, "sampling/importance_sampling_ratio/mean": 1.0578957796096802, "sampling/importance_sampling_ratio/min": 0.6797005534172058, "sampling/sampling_logp_difference/max": 0.5563673973083496, "sampling/sampling_logp_difference/mean": 0.023688018321990967, "step": 478, "step_time": 75.16944676099229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3513268828392029, "epoch": 0.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.9755890965461731, "kl": 0.017329782247543335, "learning_rate": 2.7988683659251475e-06, "loss": -0.0194, "num_tokens": 2663497.0, "reward": 0.10375000536441803, "reward_std": 0.2672772705554962, "rewards/reward_func/mean": 0.10375000536441803, "rewards/reward_func/std": 0.3627646863460541, "sampling/importance_sampling_ratio/max": 1.2677541971206665, "sampling/importance_sampling_ratio/mean": 0.7190382480621338, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48462724685668945, "sampling/sampling_logp_difference/mean": 0.024744585156440735, "step": 479, "step_time": 76.88298930699239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 45.625, "completions/mean_terminated_length": 45.625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.3445096015930176, "epoch": 0.96, "frac_reward_zero_std": 0.0, "grad_norm": 1.1778706312179565, "kl": 0.01838690973818302, "learning_rate": 2.7908279910306834e-06, "loss": 0.0279, "num_tokens": 2669579.0, "reward": 0.33125001192092896, "reward_std": 0.580742359161377, "rewards/reward_func/mean": 0.33125001192092896, "rewards/reward_func/std": 0.5531064867973328, "sampling/importance_sampling_ratio/max": 1.4015132188796997, "sampling/importance_sampling_ratio/mean": 0.9845772385597229, "sampling/importance_sampling_ratio/min": 0.40269726514816284, "sampling/sampling_logp_difference/max": 0.5609352588653564, "sampling/sampling_logp_difference/mean": 0.02638828381896019, "step": 480, "step_time": 77.41213980599423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3905279338359833, "epoch": 0.962, "frac_reward_zero_std": 0.0, "grad_norm": 1.2462440729141235, "kl": 0.032538220286369324, "learning_rate": 2.7827845654882112e-06, "loss": -0.0404, "num_tokens": 2675067.0, "reward": 0.07124999910593033, "reward_std": 0.2798381745815277, "rewards/reward_func/mean": 0.07124999910593033, "rewards/reward_func/std": 0.3645520806312561, "sampling/importance_sampling_ratio/max": 1.2897956371307373, "sampling/importance_sampling_ratio/mean": 0.8797916173934937, "sampling/importance_sampling_ratio/min": 0.4773842692375183, "sampling/sampling_logp_difference/max": 0.3575262427330017, "sampling/sampling_logp_difference/mean": 0.02661733888089657, "step": 481, "step_time": 87.84593146201223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.3507845997810364, "epoch": 0.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.9909263849258423, "kl": 0.024185102432966232, "learning_rate": 2.7747381736694573e-06, "loss": 0.0312, "num_tokens": 2680053.0, "reward": 0.3400000035762787, "reward_std": 0.5569354891777039, "rewards/reward_func/mean": 0.3400000035762787, "rewards/reward_func/std": 0.5335326790809631, "sampling/importance_sampling_ratio/max": 1.2050838470458984, "sampling/importance_sampling_ratio/mean": 0.8115805387496948, "sampling/importance_sampling_ratio/min": 0.21530668437480927, "sampling/sampling_logp_difference/max": 0.41031479835510254, "sampling/sampling_logp_difference/mean": 0.02810395136475563, "step": 482, "step_time": 71.15641420998145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.32739341259002686, "epoch": 0.966, "frac_reward_zero_std": 0.0, "grad_norm": 0.7815512418746948, "kl": 0.01184194814413786, "learning_rate": 2.766688899977266e-06, "loss": -0.1201, "num_tokens": 2685381.0, "reward": 0.06624999642372131, "reward_std": 0.29767611622810364, "rewards/reward_func/mean": 0.06624999642372131, "rewards/reward_func/std": 0.38037341833114624, "sampling/importance_sampling_ratio/max": 1.4397385120391846, "sampling/importance_sampling_ratio/mean": 0.828331470489502, "sampling/importance_sampling_ratio/min": 0.38339871168136597, "sampling/sampling_logp_difference/max": 0.5013303756713867, "sampling/sampling_logp_difference/mean": 0.020246436819434166, "step": 483, "step_time": 81.71323890099302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3554234504699707, "epoch": 0.968, "frac_reward_zero_std": 0.0, "grad_norm": 1.4028494358062744, "kl": 0.018699366599321365, "learning_rate": 2.7586368288447094e-06, "loss": -0.095, "num_tokens": 2690901.0, "reward": -0.06499999761581421, "reward_std": 0.048902880400419235, "rewards/reward_func/mean": -0.06499999761581421, "rewards/reward_func/std": 0.05554920434951782, "sampling/importance_sampling_ratio/max": 2.3254799842834473, "sampling/importance_sampling_ratio/mean": 1.0947003364562988, "sampling/importance_sampling_ratio/min": 0.5614188313484192, "sampling/sampling_logp_difference/max": 0.3719151020050049, "sampling/sampling_logp_difference/mean": 0.024835357442498207, "step": 484, "step_time": 80.53098407998914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 43.875, "completions/mean_terminated_length": 43.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.35123682022094727, "epoch": 0.97, "frac_reward_zero_std": 0.0, "grad_norm": 1.1409413814544678, "kl": 0.026009801775217056, "learning_rate": 2.750582044734203e-06, "loss": -0.1372, "num_tokens": 2696449.0, "reward": 0.3449999988079071, "reward_std": 0.5669803619384766, "rewards/reward_func/mean": 0.3449999988079071, "rewards/reward_func/std": 0.5428759455680847, "sampling/importance_sampling_ratio/max": 1.3252332210540771, "sampling/importance_sampling_ratio/mean": 0.7124192714691162, "sampling/importance_sampling_ratio/min": 0.3038400709629059, "sampling/sampling_logp_difference/max": 0.3864710330963135, "sampling/sampling_logp_difference/mean": 0.02677079290151596, "step": 485, "step_time": 51.5313537089969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.37189728021621704, "epoch": 0.972, "frac_reward_zero_std": 0.0, "grad_norm": 1.553479552268982, "kl": 0.01384771429002285, "learning_rate": 2.7425246321366205e-06, "loss": -0.1355, "num_tokens": 2702672.0, "reward": -0.05000000074505806, "reward_std": 0.03972514346241951, "rewards/reward_func/mean": -0.05000000074505806, "rewards/reward_func/std": 0.04105745255947113, "sampling/importance_sampling_ratio/max": 2.5754904747009277, "sampling/importance_sampling_ratio/mean": 1.1340928077697754, "sampling/importance_sampling_ratio/min": 0.5375442504882812, "sampling/sampling_logp_difference/max": 0.47546517848968506, "sampling/sampling_logp_difference/mean": 0.02835531160235405, "step": 486, "step_time": 90.47422146701138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.38735634088516235, "epoch": 0.974, "frac_reward_zero_std": 0.0, "grad_norm": 1.343964695930481, "kl": 0.02191627398133278, "learning_rate": 2.7344646755704078e-06, "loss": 0.1007, "num_tokens": 2708209.0, "reward": 0.0637499988079071, "reward_std": 0.3018624186515808, "rewards/reward_func/mean": 0.0637499988079071, "rewards/reward_func/std": 0.3814610242843628, "sampling/importance_sampling_ratio/max": 1.4456323385238647, "sampling/importance_sampling_ratio/mean": 0.9162258505821228, "sampling/importance_sampling_ratio/min": 0.4561156928539276, "sampling/sampling_logp_difference/max": 0.5119402408599854, "sampling/sampling_logp_difference/mean": 0.028758030384778976, "step": 487, "step_time": 92.49751431899494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3179323077201843, "epoch": 0.976, "frac_reward_zero_std": 0.0, "grad_norm": 0.9897328615188599, "kl": 0.04169601947069168, "learning_rate": 2.726402259580695e-06, "loss": 0.0601, "num_tokens": 2713886.0, "reward": 0.33500000834465027, "reward_std": 0.2701554596424103, "rewards/reward_func/mean": 0.33500000834465027, "rewards/reward_func/std": 0.5461553931236267, "sampling/importance_sampling_ratio/max": 1.5669019222259521, "sampling/importance_sampling_ratio/mean": 0.9279680252075195, "sampling/importance_sampling_ratio/min": 0.5139185786247253, "sampling/sampling_logp_difference/max": 0.6310797929763794, "sampling/sampling_logp_difference/mean": 0.023750916123390198, "step": 488, "step_time": 49.78819806300453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.33032259345054626, "epoch": 0.978, "frac_reward_zero_std": 0.0, "grad_norm": 0.6407569050788879, "kl": 0.02176561951637268, "learning_rate": 2.71833746873841e-06, "loss": -0.0798, "num_tokens": 2718931.0, "reward": 0.44875001907348633, "reward_std": 0.5220805406570435, "rewards/reward_func/mean": 0.44875001907348633, "rewards/reward_func/std": 0.5664535760879517, "sampling/importance_sampling_ratio/max": 1.1846626996994019, "sampling/importance_sampling_ratio/mean": 0.8170421123504639, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2127480506896973, "sampling/sampling_logp_difference/mean": 0.020459800958633423, "step": 489, "step_time": 71.78667046700139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3335261344909668, "epoch": 0.98, "frac_reward_zero_std": 0.0, "grad_norm": 0.9051764011383057, "kl": 0.027933314442634583, "learning_rate": 2.7102703876393942e-06, "loss": 0.03, "num_tokens": 2723945.0, "reward": 0.20000000298023224, "reward_std": 0.5354849100112915, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.49638697504997253, "sampling/importance_sampling_ratio/max": 1.6904243230819702, "sampling/importance_sampling_ratio/mean": 0.8453304767608643, "sampling/importance_sampling_ratio/min": 0.358101487159729, "sampling/sampling_logp_difference/max": 0.6200103759765625, "sampling/sampling_logp_difference/mean": 0.019122183322906494, "step": 490, "step_time": 73.55681845199433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.34773534536361694, "epoch": 0.982, "frac_reward_zero_std": 0.0, "grad_norm": 1.5184320211410522, "kl": 0.029565483331680298, "learning_rate": 2.702201100903511e-06, "loss": 0.2018, "num_tokens": 2730051.0, "reward": 0.36625000834465027, "reward_std": 0.5476330518722534, "rewards/reward_func/mean": 0.36625000834465027, "rewards/reward_func/std": 0.5249200463294983, "sampling/importance_sampling_ratio/max": 1.7158502340316772, "sampling/importance_sampling_ratio/mean": 0.9216998815536499, "sampling/importance_sampling_ratio/min": 0.45285990834236145, "sampling/sampling_logp_difference/max": 0.6381690502166748, "sampling/sampling_logp_difference/mean": 0.027182936668395996, "step": 491, "step_time": 56.38132729998324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.46129417419433594, "epoch": 0.984, "frac_reward_zero_std": 0.0, "grad_norm": 0.9216625690460205, "kl": 0.029165223240852356, "learning_rate": 2.694129693173759e-06, "loss": -0.0598, "num_tokens": 2735276.0, "reward": 0.4675000309944153, "reward_std": 0.5070215463638306, "rewards/reward_func/mean": 0.4675000309944153, "rewards/reward_func/std": 0.5406543612480164, "sampling/importance_sampling_ratio/max": 1.0167001485824585, "sampling/importance_sampling_ratio/mean": 0.7158698439598083, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7142941951751709, "sampling/sampling_logp_difference/mean": 0.027627810835838318, "step": 492, "step_time": 76.63889957600622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3078922629356384, "epoch": 0.986, "frac_reward_zero_std": 0.0, "grad_norm": 1.0626574754714966, "kl": 0.023861799389123917, "learning_rate": 2.6860562491153854e-06, "loss": -0.11, "num_tokens": 2740801.0, "reward": 0.2199999988079071, "reward_std": 0.5209156274795532, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.4826415479183197, "sampling/importance_sampling_ratio/max": 1.2552741765975952, "sampling/importance_sampling_ratio/mean": 0.95084547996521, "sampling/importance_sampling_ratio/min": 0.46218550205230713, "sampling/sampling_logp_difference/max": 0.9027338027954102, "sampling/sampling_logp_difference/mean": 0.023295089602470398, "step": 493, "step_time": 62.630216120014666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.33969372510910034, "epoch": 0.988, "frac_reward_zero_std": 0.0, "grad_norm": 1.0334094762802124, "kl": 0.021102532744407654, "learning_rate": 2.6779808534149986e-06, "loss": 0.0949, "num_tokens": 2746644.0, "reward": 0.09999999403953552, "reward_std": 0.2607312798500061, "rewards/reward_func/mean": 0.09999999403953552, "rewards/reward_func/std": 0.3642212748527527, "sampling/importance_sampling_ratio/max": 1.5536633729934692, "sampling/importance_sampling_ratio/mean": 0.9165699481964111, "sampling/importance_sampling_ratio/min": 0.5814899802207947, "sampling/sampling_logp_difference/max": 0.7713108062744141, "sampling/sampling_logp_difference/mean": 0.022944262251257896, "step": 494, "step_time": 97.2119018859812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3434738516807556, "epoch": 0.99, "frac_reward_zero_std": 0.0, "grad_norm": 1.3791193962097168, "kl": 0.025687772780656815, "learning_rate": 2.6699035907796796e-06, "loss": 0.2039, "num_tokens": 2752279.0, "reward": 0.20499999821186066, "reward_std": 0.5269919633865356, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.4880281090736389, "sampling/importance_sampling_ratio/max": 1.5326544046401978, "sampling/importance_sampling_ratio/mean": 1.1358022689819336, "sampling/importance_sampling_ratio/min": 0.7314006090164185, "sampling/sampling_logp_difference/max": 0.5475611686706543, "sampling/sampling_logp_difference/mean": 0.031097054481506348, "step": 495, "step_time": 91.44340489199385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3790108859539032, "epoch": 0.992, "frac_reward_zero_std": 0.0, "grad_norm": 1.1053639650344849, "kl": 0.021480565890669823, "learning_rate": 2.6618245459360896e-06, "loss": -0.2536, "num_tokens": 2757556.0, "reward": 0.0937500074505806, "reward_std": 0.27560853958129883, "rewards/reward_func/mean": 0.0937500074505806, "rewards/reward_func/std": 0.36769309639930725, "sampling/importance_sampling_ratio/max": 1.4687750339508057, "sampling/importance_sampling_ratio/mean": 0.9616619348526001, "sampling/importance_sampling_ratio/min": 0.2960628867149353, "sampling/sampling_logp_difference/max": 0.517666220664978, "sampling/sampling_logp_difference/mean": 0.028408560901880264, "step": 496, "step_time": 87.52318387202104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.31511783599853516, "epoch": 0.994, "frac_reward_zero_std": 0.0, "grad_norm": 0.8182615637779236, "kl": 0.014653578400611877, "learning_rate": 2.6537438036295876e-06, "loss": -0.0539, "num_tokens": 2763537.0, "reward": 0.45750001072883606, "reward_std": 0.5164840221405029, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.5492787957191467, "sampling/importance_sampling_ratio/max": 1.4413000345230103, "sampling/importance_sampling_ratio/mean": 0.7662212252616882, "sampling/importance_sampling_ratio/min": 0.33490437269210815, "sampling/sampling_logp_difference/max": 0.8015744686126709, "sampling/sampling_logp_difference/mean": 0.022109784185886383, "step": 497, "step_time": 83.04217743998743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.34117379784584045, "epoch": 0.996, "frac_reward_zero_std": 0.0, "grad_norm": 1.383134365081787, "kl": 0.03160976245999336, "learning_rate": 2.6456614486233344e-06, "loss": 0.0937, "num_tokens": 2768283.0, "reward": 0.45125001668930054, "reward_std": 0.611153244972229, "rewards/reward_func/mean": 0.45125001668930054, "rewards/reward_func/std": 0.5658984780311584, "sampling/importance_sampling_ratio/max": 1.6628714799880981, "sampling/importance_sampling_ratio/mean": 1.1422840356826782, "sampling/importance_sampling_ratio/min": 0.6167079210281372, "sampling/sampling_logp_difference/max": 0.4831216335296631, "sampling/sampling_logp_difference/mean": 0.025718016549944878, "step": 498, "step_time": 52.98551483498886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.40990570187568665, "epoch": 0.998, "frac_reward_zero_std": 0.0, "grad_norm": 1.3361786603927612, "kl": 0.02863333187997341, "learning_rate": 2.6375775656974124e-06, "loss": 0.1209, "num_tokens": 2773418.0, "reward": 0.33250001072883606, "reward_std": 0.5637004375457764, "rewards/reward_func/mean": 0.33250001072883606, "rewards/reward_func/std": 0.545494556427002, "sampling/importance_sampling_ratio/max": 1.6512422561645508, "sampling/importance_sampling_ratio/mean": 1.0369747877120972, "sampling/importance_sampling_ratio/min": 0.7347527146339417, "sampling/sampling_logp_difference/max": 0.4192899465560913, "sampling/sampling_logp_difference/mean": 0.0260856244713068, "step": 499, "step_time": 81.9041408339981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.36693620681762695, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.0463021993637085, "kl": 0.029872596263885498, "learning_rate": 2.6294922396479263e-06, "loss": -0.2292, "num_tokens": 2778968.0, "reward": 0.20874999463558197, "reward_std": 0.3164796531200409, "rewards/reward_func/mean": 0.20874999463558197, "rewards/reward_func/std": 0.4829207956790924, "sampling/importance_sampling_ratio/max": 1.7435824871063232, "sampling/importance_sampling_ratio/mean": 0.9001740217208862, "sampling/importance_sampling_ratio/min": 0.30285191535949707, "sampling/sampling_logp_difference/max": 0.6381608247756958, "sampling/sampling_logp_difference/mean": 0.031203145161271095, "step": 500, "step_time": 110.34948237799108 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 2778968, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }