diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.35832011699676514, + "epoch": 0.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.264101505279541, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.2758, + "num_tokens": 5417.0, + "reward": 0.4775000214576721, + "reward_std": 0.5056283473968506, + "rewards/reward_func/mean": 0.4775000214576721, + "rewards/reward_func/std": 0.5403900742530823, + "sampling/importance_sampling_ratio/max": 2.4071154594421387, + "sampling/importance_sampling_ratio/mean": 1.1429595947265625, + "sampling/importance_sampling_ratio/min": 0.5015585422515869, + "sampling/sampling_logp_difference/max": 0.5305562019348145, + "sampling/sampling_logp_difference/mean": 0.024324804544448853, + "step": 1, + "step_time": 30.05394913199416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3473261594772339, + "epoch": 0.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3787319660186768, + "kl": 0.0, + "learning_rate": 1.6666666666666668e-07, + "loss": 0.2918, + "num_tokens": 11253.0, + "reward": 0.581250011920929, + "reward_std": 0.5712425708770752, + "rewards/reward_func/mean": 0.581250011920929, + "rewards/reward_func/std": 0.5513473749160767, + "sampling/importance_sampling_ratio/max": 2.3380353450775146, + "sampling/importance_sampling_ratio/mean": 1.2109484672546387, + "sampling/importance_sampling_ratio/min": 0.4137703776359558, + "sampling/sampling_logp_difference/max": 0.6683757305145264, + "sampling/sampling_logp_difference/mean": 0.024658963084220886, + "step": 2, + "step_time": 40.791004868005984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 56.25, + "completions/mean_terminated_length": 56.25, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.360579252243042, + "epoch": 0.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.022918939590454, + "kl": 0.0012595340376719832, + "learning_rate": 3.3333333333333335e-07, + "loss": -0.118, + "num_tokens": 16681.0, + "reward": 0.48250001668930054, + "reward_std": 0.5084458589553833, + "rewards/reward_func/mean": 0.48250001668930054, + "rewards/reward_func/std": 0.5402578711509705, + "sampling/importance_sampling_ratio/max": 1.7170885801315308, + "sampling/importance_sampling_ratio/mean": 0.9650065898895264, + "sampling/importance_sampling_ratio/min": 0.30409955978393555, + "sampling/sampling_logp_difference/max": 0.5745421648025513, + "sampling/sampling_logp_difference/mean": 0.02655378170311451, + "step": 3, + "step_time": 32.76647898698866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 55.75, + "completions/mean_terminated_length": 55.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3717118501663208, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.620926856994629, + "kl": 0.0016124368412420154, + "learning_rate": 5.000000000000001e-07, + "loss": -0.3319, + "num_tokens": 22512.0, + "reward": 0.3187499940395355, + "reward_std": 0.576077938079834, + "rewards/reward_func/mean": 0.3187499940395355, + "rewards/reward_func/std": 0.5602917075157166, + "sampling/importance_sampling_ratio/max": 2.1794252395629883, + "sampling/importance_sampling_ratio/mean": 1.2134031057357788, + "sampling/importance_sampling_ratio/min": 0.6474471092224121, + "sampling/sampling_logp_difference/max": 0.5795614719390869, + "sampling/sampling_logp_difference/mean": 0.025603361427783966, + "step": 4, + "step_time": 37.759238163998816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 52.0, + "completions/mean_terminated_length": 52.0, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3655037581920624, + "epoch": 0.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.429198980331421, + "kl": 0.0023652869276702404, + "learning_rate": 6.666666666666667e-07, + "loss": 0.1995, + "num_tokens": 28617.0, + "reward": 0.17875000834465027, + "reward_std": 0.5367715358734131, + "rewards/reward_func/mean": 0.17875000834465027, + "rewards/reward_func/std": 0.4974919259548187, + "sampling/importance_sampling_ratio/max": 2.4826161861419678, + "sampling/importance_sampling_ratio/mean": 1.161120057106018, + "sampling/importance_sampling_ratio/min": 0.5131281018257141, + "sampling/sampling_logp_difference/max": 0.5112643241882324, + "sampling/sampling_logp_difference/mean": 0.024323755875229836, + "step": 5, + "step_time": 41.24092036399816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3876664638519287, + "epoch": 0.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3607584238052368, + "kl": 0.001689540920779109, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0571, + "num_tokens": 33384.0, + "reward": 0.5862500071525574, + "reward_std": 0.5741689205169678, + "rewards/reward_func/mean": 0.5862500071525574, + "rewards/reward_func/std": 0.5580050945281982, + "sampling/importance_sampling_ratio/max": 1.6102370023727417, + "sampling/importance_sampling_ratio/mean": 0.898230254650116, + "sampling/importance_sampling_ratio/min": 0.34930747747421265, + "sampling/sampling_logp_difference/max": 0.7642672061920166, + "sampling/sampling_logp_difference/mean": 0.027652274817228317, + "step": 6, + "step_time": 23.575158892999752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3462250828742981, + "epoch": 0.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7692639827728271, + "kl": 0.0016474956646561623, + "learning_rate": 1.0000000000000002e-06, + "loss": -0.1133, + "num_tokens": 39130.0, + "reward": 0.16500000655651093, + "reward_std": 0.5287132263183594, + "rewards/reward_func/mean": 0.16500000655651093, + "rewards/reward_func/std": 0.4902477562427521, + "sampling/importance_sampling_ratio/max": 1.69410240650177, + "sampling/importance_sampling_ratio/mean": 1.0078184604644775, + "sampling/importance_sampling_ratio/min": 0.47111791372299194, + "sampling/sampling_logp_difference/max": 0.5242133140563965, + "sampling/sampling_logp_difference/mean": 0.01969078555703163, + "step": 7, + "step_time": 42.84424216199841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3769228458404541, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4515846967697144, + "kl": 0.0016455797012895346, + "learning_rate": 1.1666666666666668e-06, + "loss": -0.0421, + "num_tokens": 45078.0, + "reward": 0.09000000357627869, + "reward_std": 0.27620843052864075, + "rewards/reward_func/mean": 0.09000000357627869, + "rewards/reward_func/std": 0.36974895000457764, + "sampling/importance_sampling_ratio/max": 1.7868865728378296, + "sampling/importance_sampling_ratio/mean": 1.0823638439178467, + "sampling/importance_sampling_ratio/min": 0.3364071249961853, + "sampling/sampling_logp_difference/max": 0.5305700898170471, + "sampling/sampling_logp_difference/mean": 0.02302919700741768, + "step": 8, + "step_time": 35.18679962800525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.29516422748565674, + "epoch": 0.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.408183217048645, + "kl": 0.0017756135202944279, + "learning_rate": 1.3333333333333334e-06, + "loss": -0.4256, + "num_tokens": 50752.0, + "reward": 0.30375000834465027, + "reward_std": 0.5890097618103027, + "rewards/reward_func/mean": 0.30375000834465027, + "rewards/reward_func/std": 0.5595645308494568, + "sampling/importance_sampling_ratio/max": 2.3854382038116455, + "sampling/importance_sampling_ratio/mean": 0.949848473072052, + "sampling/importance_sampling_ratio/min": 0.43223410844802856, + "sampling/sampling_logp_difference/max": 0.3985975682735443, + "sampling/sampling_logp_difference/mean": 0.024290772154927254, + "step": 9, + "step_time": 36.36361191300966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 51.625, + "completions/mean_terminated_length": 51.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3654099106788635, + "epoch": 0.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.196986675262451, + "kl": 0.0032762186601758003, + "learning_rate": 1.5e-06, + "loss": -0.2107, + "num_tokens": 56409.0, + "reward": 0.09375, + "reward_std": 0.2810794711112976, + "rewards/reward_func/mean": 0.09375, + "rewards/reward_func/std": 0.36730435490608215, + "sampling/importance_sampling_ratio/max": 2.5383903980255127, + "sampling/importance_sampling_ratio/mean": 1.4091622829437256, + "sampling/importance_sampling_ratio/min": 0.6962175369262695, + "sampling/sampling_logp_difference/max": 1.2695891857147217, + "sampling/sampling_logp_difference/mean": 0.030169054865837097, + "step": 10, + "step_time": 38.47457867600315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.4065442681312561, + "epoch": 0.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2501963376998901, + "kl": 0.0023037088103592396, + "learning_rate": 1.6666666666666667e-06, + "loss": -0.0843, + "num_tokens": 62619.0, + "reward": 0.07124999910593033, + "reward_std": 0.29781630635261536, + "rewards/reward_func/mean": 0.07124999910593033, + "rewards/reward_func/std": 0.37745150923728943, + "sampling/importance_sampling_ratio/max": 1.785474419593811, + "sampling/importance_sampling_ratio/mean": 0.9539740085601807, + "sampling/importance_sampling_ratio/min": 0.431792289018631, + "sampling/sampling_logp_difference/max": 0.6463680267333984, + "sampling/sampling_logp_difference/mean": 0.023907780647277832, + "step": 11, + "step_time": 36.200093806008226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 50.625, + "completions/mean_terminated_length": 50.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.35928064584732056, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0825868844985962, + "kl": 0.0015529417432844639, + "learning_rate": 1.8333333333333333e-06, + "loss": 0.1996, + "num_tokens": 68221.0, + "reward": 0.08125000447034836, + "reward_std": 0.2829767167568207, + "rewards/reward_func/mean": 0.08125000447034836, + "rewards/reward_func/std": 0.3724604547023773, + "sampling/importance_sampling_ratio/max": 1.8231761455535889, + "sampling/importance_sampling_ratio/mean": 1.0552520751953125, + "sampling/importance_sampling_ratio/min": 0.6413195133209229, + "sampling/sampling_logp_difference/max": 0.5809029340744019, + "sampling/sampling_logp_difference/mean": 0.02717738226056099, + "step": 12, + "step_time": 36.93798936299572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3927002549171448, + "epoch": 0.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.864451289176941, + "kl": 0.0021730177104473114, + "learning_rate": 2.0000000000000003e-06, + "loss": -0.2762, + "num_tokens": 74060.0, + "reward": 0.08249999582767487, + "reward_std": 0.2659415602684021, + "rewards/reward_func/mean": 0.08249999582767487, + "rewards/reward_func/std": 0.35664108395576477, + "sampling/importance_sampling_ratio/max": 2.3068079948425293, + "sampling/importance_sampling_ratio/mean": 1.3477238416671753, + "sampling/importance_sampling_ratio/min": 0.39379340410232544, + "sampling/sampling_logp_difference/max": 0.6892986297607422, + "sampling/sampling_logp_difference/mean": 0.028836481273174286, + "step": 13, + "step_time": 39.853118643004564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.40448594093322754, + "epoch": 0.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6799466609954834, + "kl": 0.001972722355276346, + "learning_rate": 2.166666666666667e-06, + "loss": -0.0182, + "num_tokens": 78889.0, + "reward": 0.3199999928474426, + "reward_std": 0.5358837246894836, + "rewards/reward_func/mean": 0.3199999928474426, + "rewards/reward_func/std": 0.5208784341812134, + "sampling/importance_sampling_ratio/max": 1.972301721572876, + "sampling/importance_sampling_ratio/mean": 1.1966495513916016, + "sampling/importance_sampling_ratio/min": 0.5895167589187622, + "sampling/sampling_logp_difference/max": 0.33179569244384766, + "sampling/sampling_logp_difference/mean": 0.02415962889790535, + "step": 14, + "step_time": 30.755012333000195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 54.5, + "completions/mean_terminated_length": 54.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3787211775779724, + "epoch": 0.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1619508266448975, + "kl": 0.0027514053508639336, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.1111, + "num_tokens": 84325.0, + "reward": 0.48250001668930054, + "reward_std": 0.5952338576316833, + "rewards/reward_func/mean": 0.48250001668930054, + "rewards/reward_func/std": 0.5513554811477661, + "sampling/importance_sampling_ratio/max": 1.4301223754882812, + "sampling/importance_sampling_ratio/mean": 0.8711071610450745, + "sampling/importance_sampling_ratio/min": 0.36811545491218567, + "sampling/sampling_logp_difference/max": 0.8878096342086792, + "sampling/sampling_logp_difference/mean": 0.026348719373345375, + "step": 15, + "step_time": 28.58861476900347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3328179121017456, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6959343552589417, + "kl": 0.001039550406858325, + "learning_rate": 2.5e-06, + "loss": -0.0017, + "num_tokens": 89747.0, + "reward": 0.0912499949336052, + "reward_std": 0.2814132273197174, + "rewards/reward_func/mean": 0.0912499949336052, + "rewards/reward_func/std": 0.3689536452293396, + "sampling/importance_sampling_ratio/max": 1.175565481185913, + "sampling/importance_sampling_ratio/mean": 0.8306176066398621, + "sampling/importance_sampling_ratio/min": 0.3712834417819977, + "sampling/sampling_logp_difference/max": 0.5475239753723145, + "sampling/sampling_logp_difference/mean": 0.024504121392965317, + "step": 16, + "step_time": 36.591242304988555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3186699450016022, + "epoch": 0.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1346180438995361, + "kl": 0.0015763898845762014, + "learning_rate": 2.666666666666667e-06, + "loss": -0.1755, + "num_tokens": 95132.0, + "reward": 0.4337500035762787, + "reward_std": 0.08031108975410461, + "rewards/reward_func/mean": 0.4337500035762787, + "rewards/reward_func/std": 0.548945426940918, + "sampling/importance_sampling_ratio/max": 1.9401581287384033, + "sampling/importance_sampling_ratio/mean": 0.9301601648330688, + "sampling/importance_sampling_ratio/min": 0.5294641852378845, + "sampling/sampling_logp_difference/max": 0.3347742557525635, + "sampling/sampling_logp_difference/mean": 0.018799789249897003, + "step": 17, + "step_time": 44.30997213399678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 58.125, + "completions/mean_terminated_length": 58.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.41471022367477417, + "epoch": 0.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5832308530807495, + "kl": 0.0018035045359283686, + "learning_rate": 2.8333333333333335e-06, + "loss": 0.1381, + "num_tokens": 101131.0, + "reward": 0.32875001430511475, + "reward_std": 0.5735915303230286, + "rewards/reward_func/mean": 0.32875001430511475, + "rewards/reward_func/std": 0.548646092414856, + "sampling/importance_sampling_ratio/max": 2.193835496902466, + "sampling/importance_sampling_ratio/mean": 1.2385116815567017, + "sampling/importance_sampling_ratio/min": 0.7738122344017029, + "sampling/sampling_logp_difference/max": 0.41960763931274414, + "sampling/sampling_logp_difference/mean": 0.025856416672468185, + "step": 18, + "step_time": 39.32068163600343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 50.625, + "completions/mean_terminated_length": 50.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3718729615211487, + "epoch": 0.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9075581431388855, + "kl": 0.0021873265504837036, + "learning_rate": 3e-06, + "loss": -0.0808, + "num_tokens": 106359.0, + "reward": -0.04749999940395355, + "reward_std": 0.04638735204935074, + "rewards/reward_func/mean": -0.04749999940395355, + "rewards/reward_func/std": 0.04527692496776581, + "sampling/importance_sampling_ratio/max": 1.4301691055297852, + "sampling/importance_sampling_ratio/mean": 0.8160060048103333, + "sampling/importance_sampling_ratio/min": 0.3894173800945282, + "sampling/sampling_logp_difference/max": 0.7142742872238159, + "sampling/sampling_logp_difference/mean": 0.026370640844106674, + "step": 19, + "step_time": 38.445566442009294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.340387225151062, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.247363805770874, + "kl": 0.001818232238292694, + "learning_rate": 3.1666666666666667e-06, + "loss": 0.0987, + "num_tokens": 112295.0, + "reward": 0.07375000417232513, + "reward_std": 0.288117915391922, + "rewards/reward_func/mean": 0.07375000417232513, + "rewards/reward_func/std": 0.3656671941280365, + "sampling/importance_sampling_ratio/max": 2.058504581451416, + "sampling/importance_sampling_ratio/mean": 1.1552469730377197, + "sampling/importance_sampling_ratio/min": 0.7517771124839783, + "sampling/sampling_logp_difference/max": 0.4139009714126587, + "sampling/sampling_logp_difference/mean": 0.02037879265844822, + "step": 20, + "step_time": 37.69729056301003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.37244129180908203, + "epoch": 0.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0636333227157593, + "kl": 0.0019398150034248829, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1205, + "num_tokens": 118239.0, + "reward": 0.20250000059604645, + "reward_std": 0.3253607749938965, + "rewards/reward_func/mean": 0.20250000059604645, + "rewards/reward_func/std": 0.49331969022750854, + "sampling/importance_sampling_ratio/max": 1.381617546081543, + "sampling/importance_sampling_ratio/mean": 0.9092652201652527, + "sampling/importance_sampling_ratio/min": 0.5254658460617065, + "sampling/sampling_logp_difference/max": 0.706791877746582, + "sampling/sampling_logp_difference/mean": 0.025023311376571655, + "step": 21, + "step_time": 42.413751760002924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 57.0, + "completions/mean_terminated_length": 57.0, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.32273274660110474, + "epoch": 0.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8321790099143982, + "kl": 0.0011215595295652747, + "learning_rate": 3.5e-06, + "loss": 0.0122, + "num_tokens": 123987.0, + "reward": 0.06625001132488251, + "reward_std": 0.292948454618454, + "rewards/reward_func/mean": 0.06625001132488251, + "rewards/reward_func/std": 0.3691278398036957, + "sampling/importance_sampling_ratio/max": 1.8723653554916382, + "sampling/importance_sampling_ratio/mean": 0.8478108644485474, + "sampling/importance_sampling_ratio/min": 0.38013601303100586, + "sampling/sampling_logp_difference/max": 0.4778859615325928, + "sampling/sampling_logp_difference/mean": 0.022008519619703293, + "step": 22, + "step_time": 36.69069067799137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.36345064640045166, + "epoch": 0.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9762702584266663, + "kl": 0.0013981228694319725, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.2781, + "num_tokens": 129451.0, + "reward": 0.2224999964237213, + "reward_std": 0.5127884149551392, + "rewards/reward_func/mean": 0.2224999964237213, + "rewards/reward_func/std": 0.4748157858848572, + "sampling/importance_sampling_ratio/max": 1.6413739919662476, + "sampling/importance_sampling_ratio/mean": 0.99114990234375, + "sampling/importance_sampling_ratio/min": 0.44171378016471863, + "sampling/sampling_logp_difference/max": 0.47839784622192383, + "sampling/sampling_logp_difference/mean": 0.02052409201860428, + "step": 23, + "step_time": 32.5673299110058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.36843228340148926, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1462467908859253, + "kl": 0.0015321827959269285, + "learning_rate": 3.833333333333334e-06, + "loss": 0.0698, + "num_tokens": 135132.0, + "reward": 0.23000001907348633, + "reward_std": 0.5106528997421265, + "rewards/reward_func/mean": 0.23000001907348633, + "rewards/reward_func/std": 0.4728032052516937, + "sampling/importance_sampling_ratio/max": 1.2816814184188843, + "sampling/importance_sampling_ratio/mean": 0.9215267300605774, + "sampling/importance_sampling_ratio/min": 0.5011075139045715, + "sampling/sampling_logp_difference/max": 0.5086667537689209, + "sampling/sampling_logp_difference/mean": 0.0266867745667696, + "step": 24, + "step_time": 32.060357182999724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3656223714351654, + "epoch": 0.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6805341243743896, + "kl": 0.00146653619594872, + "learning_rate": 4.000000000000001e-06, + "loss": -0.3181, + "num_tokens": 140635.0, + "reward": 0.3475000262260437, + "reward_std": 0.5446269512176514, + "rewards/reward_func/mean": 0.3475000262260437, + "rewards/reward_func/std": 0.5270063877105713, + "sampling/importance_sampling_ratio/max": 1.6357054710388184, + "sampling/importance_sampling_ratio/mean": 1.019447684288025, + "sampling/importance_sampling_ratio/min": 0.44408416748046875, + "sampling/sampling_logp_difference/max": 0.3505585193634033, + "sampling/sampling_logp_difference/mean": 0.022699594497680664, + "step": 25, + "step_time": 27.742008060988155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.0, + "completions/mean_terminated_length": 52.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3471141457557678, + "epoch": 0.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8676110506057739, + "kl": 0.0014656296698376536, + "learning_rate": 4.166666666666667e-06, + "loss": -0.0186, + "num_tokens": 146738.0, + "reward": -0.058750003576278687, + "reward_std": 0.0412052683532238, + "rewards/reward_func/mean": -0.058750003576278687, + "rewards/reward_func/std": 0.041554611176252365, + "sampling/importance_sampling_ratio/max": 1.3209142684936523, + "sampling/importance_sampling_ratio/mean": 0.8133621215820312, + "sampling/importance_sampling_ratio/min": 0.41941919922828674, + "sampling/sampling_logp_difference/max": 0.35713934898376465, + "sampling/sampling_logp_difference/mean": 0.02249450981616974, + "step": 26, + "step_time": 44.71595883600821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.32017290592193604, + "epoch": 0.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609636664390564, + "kl": 0.0014645641203969717, + "learning_rate": 4.333333333333334e-06, + "loss": 0.0422, + "num_tokens": 151823.0, + "reward": 0.09000000357627869, + "reward_std": 0.28282541036605835, + "rewards/reward_func/mean": 0.09000000357627869, + "rewards/reward_func/std": 0.37028947472572327, + "sampling/importance_sampling_ratio/max": 1.3593155145645142, + "sampling/importance_sampling_ratio/mean": 0.9048177003860474, + "sampling/importance_sampling_ratio/min": 0.6389055848121643, + "sampling/sampling_logp_difference/max": 0.3579772114753723, + "sampling/sampling_logp_difference/mean": 0.021750561892986298, + "step": 27, + "step_time": 33.81092618100229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.437652051448822, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9713760614395142, + "kl": 0.0016673305071890354, + "learning_rate": 4.5e-06, + "loss": -0.4088, + "num_tokens": 157338.0, + "reward": 0.3199999928474426, + "reward_std": 0.5766737461090088, + "rewards/reward_func/mean": 0.3199999928474426, + "rewards/reward_func/std": 0.558569610118866, + "sampling/importance_sampling_ratio/max": 2.007301092147827, + "sampling/importance_sampling_ratio/mean": 1.1249253749847412, + "sampling/importance_sampling_ratio/min": 0.32354435324668884, + "sampling/sampling_logp_difference/max": 0.32819199562072754, + "sampling/sampling_logp_difference/mean": 0.02238292247056961, + "step": 28, + "step_time": 37.15755737799918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.4016689658164978, + "epoch": 0.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5781503915786743, + "kl": 0.0016747142653912306, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0816, + "num_tokens": 163567.0, + "reward": 0.33500000834465027, + "reward_std": 0.5604823231697083, + "rewards/reward_func/mean": 0.33500000834465027, + "rewards/reward_func/std": 0.5377200245857239, + "sampling/importance_sampling_ratio/max": 1.7749114036560059, + "sampling/importance_sampling_ratio/mean": 1.0603770017623901, + "sampling/importance_sampling_ratio/min": 0.6003548502922058, + "sampling/sampling_logp_difference/max": 0.447523832321167, + "sampling/sampling_logp_difference/mean": 0.02700149640440941, + "step": 29, + "step_time": 38.102109844010556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3838121294975281, + "epoch": 0.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1578418016433716, + "kl": 0.002017122693359852, + "learning_rate": 4.833333333333333e-06, + "loss": 0.0726, + "num_tokens": 169243.0, + "reward": 0.19749999046325684, + "reward_std": 0.5363912582397461, + "rewards/reward_func/mean": 0.19749999046325684, + "rewards/reward_func/std": 0.49660995602607727, + "sampling/importance_sampling_ratio/max": 2.339462995529175, + "sampling/importance_sampling_ratio/mean": 1.108468770980835, + "sampling/importance_sampling_ratio/min": 0.5691421031951904, + "sampling/sampling_logp_difference/max": 0.38687825202941895, + "sampling/sampling_logp_difference/mean": 0.027435339987277985, + "step": 30, + "step_time": 34.95135859200673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 40.625, + "completions/mean_terminated_length": 40.625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.33706551790237427, + "epoch": 0.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9564450979232788, + "kl": 0.0014677501749247313, + "learning_rate": 5e-06, + "loss": 0.084, + "num_tokens": 175424.0, + "reward": 0.32374998927116394, + "reward_std": 0.5670105218887329, + "rewards/reward_func/mean": 0.32374998927116394, + "rewards/reward_func/std": 0.5370538234710693, + "sampling/importance_sampling_ratio/max": 1.2009906768798828, + "sampling/importance_sampling_ratio/mean": 0.8830825090408325, + "sampling/importance_sampling_ratio/min": 0.47186899185180664, + "sampling/sampling_logp_difference/max": 0.47310686111450195, + "sampling/sampling_logp_difference/mean": 0.02114824578166008, + "step": 31, + "step_time": 39.84195031199488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 57.625, + "completions/mean_terminated_length": 57.625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.35063254833221436, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9092409610748291, + "kl": 0.0016583865508437157, + "learning_rate": 4.99998688809149e-06, + "loss": -0.0826, + "num_tokens": 180456.0, + "reward": 0.08749999850988388, + "reward_std": 0.2758382558822632, + "rewards/reward_func/mean": 0.08749999850988388, + "rewards/reward_func/std": 0.369314044713974, + "sampling/importance_sampling_ratio/max": 1.4586107730865479, + "sampling/importance_sampling_ratio/mean": 0.7794057130813599, + "sampling/importance_sampling_ratio/min": 0.26318106055259705, + "sampling/sampling_logp_difference/max": 0.4512190818786621, + "sampling/sampling_logp_difference/mean": 0.02120751515030861, + "step": 32, + "step_time": 35.140949385997374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 57.0, + "completions/mean_terminated_length": 57.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.37451615929603577, + "epoch": 0.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.266938328742981, + "kl": 0.0013357822317630053, + "learning_rate": 4.9999475525034974e-06, + "loss": 0.2632, + "num_tokens": 186220.0, + "reward": -0.05249999836087227, + "reward_std": 0.060625821352005005, + "rewards/reward_func/mean": -0.05249999836087227, + "rewards/reward_func/std": 0.06363961100578308, + "sampling/importance_sampling_ratio/max": 1.8263978958129883, + "sampling/importance_sampling_ratio/mean": 1.2238435745239258, + "sampling/importance_sampling_ratio/min": 0.7157539129257202, + "sampling/sampling_logp_difference/max": 0.33373260498046875, + "sampling/sampling_logp_difference/mean": 0.021757911890745163, + "step": 33, + "step_time": 43.75499219499761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 53.875, + "completions/mean_terminated_length": 53.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.35137397050857544, + "epoch": 0.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.322182536125183, + "kl": 0.0025231819599866867, + "learning_rate": 4.999881993648633e-06, + "loss": -0.1529, + "num_tokens": 191647.0, + "reward": 0.1850000023841858, + "reward_std": 0.5347095727920532, + "rewards/reward_func/mean": 0.1850000023841858, + "rewards/reward_func/std": 0.4952344596385956, + "sampling/importance_sampling_ratio/max": 1.5585519075393677, + "sampling/importance_sampling_ratio/mean": 0.9386357665061951, + "sampling/importance_sampling_ratio/min": 0.322815865278244, + "sampling/sampling_logp_difference/max": 0.49080967903137207, + "sampling/sampling_logp_difference/mean": 0.024871371686458588, + "step": 34, + "step_time": 40.144638138997834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.38170647621154785, + "epoch": 0.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953609585762024, + "kl": 0.001635606400668621, + "learning_rate": 4.99979021221458e-06, + "loss": -0.0461, + "num_tokens": 197510.0, + "reward": 0.2150000035762787, + "reward_std": 0.5176790952682495, + "rewards/reward_func/mean": 0.2150000035762787, + "rewards/reward_func/std": 0.47931498289108276, + "sampling/importance_sampling_ratio/max": 2.3140695095062256, + "sampling/importance_sampling_ratio/mean": 1.449246883392334, + "sampling/importance_sampling_ratio/min": 0.5865305066108704, + "sampling/sampling_logp_difference/max": 0.3477973937988281, + "sampling/sampling_logp_difference/mean": 0.025315163657069206, + "step": 35, + "step_time": 33.668802553002024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 63.375, + "completions/mean_terminated_length": 63.375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.3781251311302185, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1683272123336792, + "kl": 0.0017781654605641961, + "learning_rate": 4.9996722091640805e-06, + "loss": -0.0293, + "num_tokens": 202499.0, + "reward": 0.3174999952316284, + "reward_std": 0.5784124135971069, + "rewards/reward_func/mean": 0.3174999952316284, + "rewards/reward_func/std": 0.5600191354751587, + "sampling/importance_sampling_ratio/max": 1.6650283336639404, + "sampling/importance_sampling_ratio/mean": 1.2166297435760498, + "sampling/importance_sampling_ratio/min": 0.7639206647872925, + "sampling/sampling_logp_difference/max": 0.44861912727355957, + "sampling/sampling_logp_difference/mean": 0.019233262166380882, + "step": 36, + "step_time": 31.37140485800046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.375, + "completions/mean_terminated_length": 49.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.42192620038986206, + "epoch": 0.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7755328416824341, + "kl": 0.00290558859705925, + "learning_rate": 4.999527985734932e-06, + "loss": 0.1481, + "num_tokens": 208194.0, + "reward": 0.16625000536441803, + "reward_std": 0.3537360727787018, + "rewards/reward_func/mean": 0.16625000536441803, + "rewards/reward_func/std": 0.49612608551979065, + "sampling/importance_sampling_ratio/max": 1.2127708196640015, + "sampling/importance_sampling_ratio/mean": 0.6914026737213135, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.3314962387084961, + "sampling/sampling_logp_difference/mean": 0.026732761412858963, + "step": 37, + "step_time": 41.92197997200128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 45.0, + "completions/mean_terminated_length": 45.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3740369379520416, + "epoch": 0.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.343247652053833, + "kl": 0.0024613114073872566, + "learning_rate": 4.999357543439969e-06, + "loss": 0.0229, + "num_tokens": 213926.0, + "reward": 0.23625001311302185, + "reward_std": 0.49587899446487427, + "rewards/reward_func/mean": 0.23625001311302185, + "rewards/reward_func/std": 0.4592210054397583, + "sampling/importance_sampling_ratio/max": 1.9151374101638794, + "sampling/importance_sampling_ratio/mean": 1.0875842571258545, + "sampling/importance_sampling_ratio/min": 0.6512177586555481, + "sampling/sampling_logp_difference/max": 0.4941213130950928, + "sampling/sampling_logp_difference/mean": 0.02705160342156887, + "step": 38, + "step_time": 35.657528919997276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.38872674107551575, + "epoch": 0.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.444061517715454, + "kl": 0.00284218811430037, + "learning_rate": 4.999160884067051e-06, + "loss": 0.1495, + "num_tokens": 219542.0, + "reward": 0.4675000011920929, + "reward_std": 0.5747828483581543, + "rewards/reward_func/mean": 0.4675000011920929, + "rewards/reward_func/std": 0.5322123765945435, + "sampling/importance_sampling_ratio/max": 2.0046651363372803, + "sampling/importance_sampling_ratio/mean": 0.9659349918365479, + "sampling/importance_sampling_ratio/min": 0.2745288014411926, + "sampling/sampling_logp_difference/max": 0.6689493656158447, + "sampling/sampling_logp_difference/mean": 0.03054666332900524, + "step": 39, + "step_time": 41.85108021501219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.34974485635757446, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2580375671386719, + "kl": 0.0013389361556619406, + "learning_rate": 4.9989380096790416e-06, + "loss": -0.1842, + "num_tokens": 225548.0, + "reward": 0.07874999940395355, + "reward_std": 0.2814640700817108, + "rewards/reward_func/mean": 0.07874999940395355, + "rewards/reward_func/std": 0.37380045652389526, + "sampling/importance_sampling_ratio/max": 1.6368787288665771, + "sampling/importance_sampling_ratio/mean": 1.1340277194976807, + "sampling/importance_sampling_ratio/min": 0.5276371836662292, + "sampling/sampling_logp_difference/max": 0.4883451461791992, + "sampling/sampling_logp_difference/mean": 0.019177088513970375, + "step": 40, + "step_time": 49.17670016500051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3754725456237793, + "epoch": 0.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3454259634017944, + "kl": 0.002098641125485301, + "learning_rate": 4.998688922613788e-06, + "loss": -0.2422, + "num_tokens": 231041.0, + "reward": 0.08375000208616257, + "reward_std": 0.2680739760398865, + "rewards/reward_func/mean": 0.08375000208616257, + "rewards/reward_func/std": 0.3638656735420227, + "sampling/importance_sampling_ratio/max": 1.3377426862716675, + "sampling/importance_sampling_ratio/mean": 0.807715892791748, + "sampling/importance_sampling_ratio/min": 0.4852953851222992, + "sampling/sampling_logp_difference/max": 0.6739339828491211, + "sampling/sampling_logp_difference/mean": 0.02791445143520832, + "step": 41, + "step_time": 35.51692575198831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3402999937534332, + "epoch": 0.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2757831811904907, + "kl": 0.0019170227460563183, + "learning_rate": 4.998413625484095e-06, + "loss": -0.1765, + "num_tokens": 236135.0, + "reward": 0.4612500071525574, + "reward_std": 0.6052623987197876, + "rewards/reward_func/mean": 0.4612500071525574, + "rewards/reward_func/std": 0.56057208776474, + "sampling/importance_sampling_ratio/max": 1.7278234958648682, + "sampling/importance_sampling_ratio/mean": 0.9955820441246033, + "sampling/importance_sampling_ratio/min": 0.2917014956474304, + "sampling/sampling_logp_difference/max": 0.5930330753326416, + "sampling/sampling_logp_difference/mean": 0.02681322768330574, + "step": 42, + "step_time": 28.90468980500009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 55.375, + "completions/mean_terminated_length": 55.375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.43568652868270874, + "epoch": 0.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1961538791656494, + "kl": 0.0031014331616461277, + "learning_rate": 4.9981121211777e-06, + "loss": 0.132, + "num_tokens": 242383.0, + "reward": 0.07874999195337296, + "reward_std": 0.273262619972229, + "rewards/reward_func/mean": 0.07874999195337296, + "rewards/reward_func/std": 0.3736475706100464, + "sampling/importance_sampling_ratio/max": 1.8331316709518433, + "sampling/importance_sampling_ratio/mean": 0.9131045341491699, + "sampling/importance_sampling_ratio/min": 0.3854982852935791, + "sampling/sampling_logp_difference/max": 0.4006004333496094, + "sampling/sampling_logp_difference/mean": 0.02947426773607731, + "step": 43, + "step_time": 40.384530305993394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3090088665485382, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4481481313705444, + "kl": 0.0016173927579075098, + "learning_rate": 4.997784412857239e-06, + "loss": 0.1052, + "num_tokens": 248660.0, + "reward": -0.06874999403953552, + "reward_std": 0.04914231598377228, + "rewards/reward_func/mean": -0.06874999403953552, + "rewards/reward_func/std": 0.04764077067375183, + "sampling/importance_sampling_ratio/max": 1.2356544733047485, + "sampling/importance_sampling_ratio/mean": 0.9581372737884521, + "sampling/importance_sampling_ratio/min": 0.5436961054801941, + "sampling/sampling_logp_difference/max": 0.29942846298217773, + "sampling/sampling_logp_difference/mean": 0.015336403623223305, + "step": 44, + "step_time": 53.181460595995304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.37320029735565186, + "epoch": 0.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1145901679992676, + "kl": 0.003977172076702118, + "learning_rate": 4.99743050396022e-06, + "loss": 0.1775, + "num_tokens": 254142.0, + "reward": 0.45625001192092896, + "reward_std": 0.6143215894699097, + "rewards/reward_func/mean": 0.45625001192092896, + "rewards/reward_func/std": 0.5695847868919373, + "sampling/importance_sampling_ratio/max": 1.8426916599273682, + "sampling/importance_sampling_ratio/mean": 0.8322780728340149, + "sampling/importance_sampling_ratio/min": 0.19014649093151093, + "sampling/sampling_logp_difference/max": 0.40699613094329834, + "sampling/sampling_logp_difference/mean": 0.027948087081313133, + "step": 45, + "step_time": 30.825133632999496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3248752951622009, + "epoch": 0.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1806740760803223, + "kl": 0.0028996632900089025, + "learning_rate": 4.997050398198977e-06, + "loss": 0.1501, + "num_tokens": 259227.0, + "reward": 0.2162500023841858, + "reward_std": 0.3277096152305603, + "rewards/reward_func/mean": 0.2162500023841858, + "rewards/reward_func/std": 0.48576709628105164, + "sampling/importance_sampling_ratio/max": 1.6642227172851562, + "sampling/importance_sampling_ratio/mean": 1.01229727268219, + "sampling/importance_sampling_ratio/min": 0.4485871493816376, + "sampling/sampling_logp_difference/max": 0.46175622940063477, + "sampling/sampling_logp_difference/mean": 0.022898774594068527, + "step": 46, + "step_time": 35.48211781200371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.4646483063697815, + "epoch": 0.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5030003786087036, + "kl": 0.0027977940626442432, + "learning_rate": 4.9966440995606415e-06, + "loss": 0.1655, + "num_tokens": 264627.0, + "reward": 0.1887499988079071, + "reward_std": 0.5297601222991943, + "rewards/reward_func/mean": 0.1887499988079071, + "rewards/reward_func/std": 0.4906100332736969, + "sampling/importance_sampling_ratio/max": 2.4210548400878906, + "sampling/importance_sampling_ratio/mean": 1.32478928565979, + "sampling/importance_sampling_ratio/min": 0.4550531506538391, + "sampling/sampling_logp_difference/max": 0.4784013032913208, + "sampling/sampling_logp_difference/mean": 0.031322650611400604, + "step": 47, + "step_time": 30.883059543004492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 51.25, + "completions/mean_terminated_length": 51.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3293306231498718, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8927807807922363, + "kl": 0.002377166645601392, + "learning_rate": 4.9962116123070925e-06, + "loss": 0.042, + "num_tokens": 270378.0, + "reward": 0.2162500023841858, + "reward_std": 0.4940055012702942, + "rewards/reward_func/mean": 0.2162500023841858, + "rewards/reward_func/std": 0.45831796526908875, + "sampling/importance_sampling_ratio/max": 1.1039625406265259, + "sampling/importance_sampling_ratio/mean": 0.8569784164428711, + "sampling/importance_sampling_ratio/min": 0.536712646484375, + "sampling/sampling_logp_difference/max": 0.34067440032958984, + "sampling/sampling_logp_difference/mean": 0.018954172730445862, + "step": 48, + "step_time": 38.16978211799869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.377954363822937, + "epoch": 0.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.487809658050537, + "kl": 0.0016349733341485262, + "learning_rate": 4.9957529409749185e-06, + "loss": 0.3189, + "num_tokens": 275988.0, + "reward": 0.07874999940395355, + "reward_std": 0.2923800051212311, + "rewards/reward_func/mean": 0.07874999940395355, + "rewards/reward_func/std": 0.3753260672092438, + "sampling/importance_sampling_ratio/max": 2.0353479385375977, + "sampling/importance_sampling_ratio/mean": 1.0705337524414062, + "sampling/importance_sampling_ratio/min": 0.35857802629470825, + "sampling/sampling_logp_difference/max": 0.5681980848312378, + "sampling/sampling_logp_difference/mean": 0.0232619009912014, + "step": 49, + "step_time": 35.3692782720027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 51.25, + "completions/mean_terminated_length": 51.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.36007416248321533, + "epoch": 0.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4686626195907593, + "kl": 0.0032578343525528908, + "learning_rate": 4.995268090375362e-06, + "loss": -0.0069, + "num_tokens": 281915.0, + "reward": 0.1899999976158142, + "reward_std": 0.3334062099456787, + "rewards/reward_func/mean": 0.1899999976158142, + "rewards/reward_func/std": 0.4919930398464203, + "sampling/importance_sampling_ratio/max": 1.8520060777664185, + "sampling/importance_sampling_ratio/mean": 1.2530664205551147, + "sampling/importance_sampling_ratio/min": 0.6933834552764893, + "sampling/sampling_logp_difference/max": 0.3359670639038086, + "sampling/sampling_logp_difference/mean": 0.02223808318376541, + "step": 50, + "step_time": 39.47781550500076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3799910545349121, + "epoch": 0.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1234054565429688, + "kl": 0.003303245175629854, + "learning_rate": 4.99475706559428e-06, + "loss": -0.1382, + "num_tokens": 287424.0, + "reward": 0.3125, + "reward_std": 0.2701009511947632, + "rewards/reward_func/mean": 0.3125, + "rewards/reward_func/std": 0.5506294965744019, + "sampling/importance_sampling_ratio/max": 1.6724814176559448, + "sampling/importance_sampling_ratio/mean": 0.9746721386909485, + "sampling/importance_sampling_ratio/min": 0.4102705419063568, + "sampling/sampling_logp_difference/max": 0.48910510540008545, + "sampling/sampling_logp_difference/mean": 0.02539738267660141, + "step": 51, + "step_time": 37.61800301600306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.31345584988594055, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0368928909301758, + "kl": 0.00263200793415308, + "learning_rate": 4.994219871992077e-06, + "loss": -0.0105, + "num_tokens": 292745.0, + "reward": 0.45499998331069946, + "reward_std": 0.5025076270103455, + "rewards/reward_func/mean": 0.45499998331069946, + "rewards/reward_func/std": 0.5353503227233887, + "sampling/importance_sampling_ratio/max": 1.1843898296356201, + "sampling/importance_sampling_ratio/mean": 0.8895880579948425, + "sampling/importance_sampling_ratio/min": 0.5178665518760681, + "sampling/sampling_logp_difference/max": 0.47790735960006714, + "sampling/sampling_logp_difference/mean": 0.026872076094150543, + "step": 52, + "step_time": 39.26654844600125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3391358554363251, + "epoch": 0.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0148347616195679, + "kl": 0.028528861701488495, + "learning_rate": 4.993656515203662e-06, + "loss": -0.0863, + "num_tokens": 298787.0, + "reward": 0.33249998092651367, + "reward_std": 0.5578641891479492, + "rewards/reward_func/mean": 0.33249998092651367, + "rewards/reward_func/std": 0.5344356298446655, + "sampling/importance_sampling_ratio/max": 1.4639109373092651, + "sampling/importance_sampling_ratio/mean": 0.7722151279449463, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.7002308368682861, + "sampling/sampling_logp_difference/mean": 0.02879432588815689, + "step": 53, + "step_time": 37.527451172994915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3680269420146942, + "epoch": 0.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1720988750457764, + "kl": 0.005885637830942869, + "learning_rate": 4.99306700113838e-06, + "loss": -0.0454, + "num_tokens": 304206.0, + "reward": 0.3412500023841858, + "reward_std": 0.5495070815086365, + "rewards/reward_func/mean": 0.3412500023841858, + "rewards/reward_func/std": 0.5323650240898132, + "sampling/importance_sampling_ratio/max": 1.4783494472503662, + "sampling/importance_sampling_ratio/mean": 0.9287598729133606, + "sampling/importance_sampling_ratio/min": 0.6589941382408142, + "sampling/sampling_logp_difference/max": 0.5849330425262451, + "sampling/sampling_logp_difference/mean": 0.02534063160419464, + "step": 54, + "step_time": 37.79061970599287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 55.125, + "completions/mean_terminated_length": 55.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.42716383934020996, + "epoch": 0.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1009771823883057, + "kl": 0.002711753360927105, + "learning_rate": 4.9924513359799555e-06, + "loss": -0.1146, + "num_tokens": 309707.0, + "reward": 0.20000000298023224, + "reward_std": 0.5274509191513062, + "rewards/reward_func/mean": 0.20000000298023224, + "rewards/reward_func/std": 0.4887009263038635, + "sampling/importance_sampling_ratio/max": 2.2008090019226074, + "sampling/importance_sampling_ratio/mean": 1.0757707357406616, + "sampling/importance_sampling_ratio/min": 0.38931164145469666, + "sampling/sampling_logp_difference/max": 1.0060789585113525, + "sampling/sampling_logp_difference/mean": 0.027470823377370834, + "step": 55, + "step_time": 34.70951578000677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 53.75, + "completions/mean_terminated_length": 53.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3485015034675598, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0752439498901367, + "kl": 0.0026783556677401066, + "learning_rate": 4.991809526186424e-06, + "loss": -0.0144, + "num_tokens": 314773.0, + "reward": 0.3149999976158142, + "reward_std": 0.5622599720954895, + "rewards/reward_func/mean": 0.3149999976158142, + "rewards/reward_func/std": 0.5434545874595642, + "sampling/importance_sampling_ratio/max": 1.2996971607208252, + "sampling/importance_sampling_ratio/mean": 0.9518929719924927, + "sampling/importance_sampling_ratio/min": 0.5955594778060913, + "sampling/sampling_logp_difference/max": 0.30002713203430176, + "sampling/sampling_logp_difference/mean": 0.020609542727470398, + "step": 56, + "step_time": 33.184696926007746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3204508423805237, + "epoch": 0.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1941601037979126, + "kl": 0.003310043830424547, + "learning_rate": 4.991141578490066e-06, + "loss": 0.2686, + "num_tokens": 320715.0, + "reward": 0.3387500047683716, + "reward_std": 0.5372268557548523, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.5153483748435974, + "sampling/importance_sampling_ratio/max": 2.146245002746582, + "sampling/importance_sampling_ratio/mean": 1.03570556640625, + "sampling/importance_sampling_ratio/min": 0.36691558361053467, + "sampling/sampling_logp_difference/max": 0.3675193786621094, + "sampling/sampling_logp_difference/mean": 0.023061014711856842, + "step": 57, + "step_time": 38.93158349399164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.33570247888565063, + "epoch": 0.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9658681750297546, + "kl": 0.0020056082867085934, + "learning_rate": 4.990447499897339e-06, + "loss": -0.0436, + "num_tokens": 326183.0, + "reward": 0.3375000059604645, + "reward_std": 0.5548287630081177, + "rewards/reward_func/mean": 0.3375000059604645, + "rewards/reward_func/std": 0.5281707048416138, + "sampling/importance_sampling_ratio/max": 1.2019702196121216, + "sampling/importance_sampling_ratio/mean": 0.9152437448501587, + "sampling/importance_sampling_ratio/min": 0.4425993859767914, + "sampling/sampling_logp_difference/max": 0.31412577629089355, + "sampling/sampling_logp_difference/mean": 0.01732739806175232, + "step": 58, + "step_time": 31.71721384600096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3592735528945923, + "epoch": 0.118, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9438511729240417, + "kl": 0.002568789292126894, + "learning_rate": 4.989727297688797e-06, + "loss": -0.0309, + "num_tokens": 331769.0, + "reward": 0.1887499988079071, + "reward_std": 0.34991249442100525, + "rewards/reward_func/mean": 0.1887499988079071, + "rewards/reward_func/std": 0.5049027800559998, + "sampling/importance_sampling_ratio/max": 1.0347727537155151, + "sampling/importance_sampling_ratio/mean": 0.7055701017379761, + "sampling/importance_sampling_ratio/min": 0.3710009455680847, + "sampling/sampling_logp_difference/max": 0.6251668930053711, + "sampling/sampling_logp_difference/mean": 0.022848688066005707, + "step": 59, + "step_time": 40.923916693005594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.2875489294528961, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1665284633636475, + "kl": 0.0035660723224282265, + "learning_rate": 4.98898097941902e-06, + "loss": -0.0455, + "num_tokens": 336898.0, + "reward": -0.05624999850988388, + "reward_std": 0.04053955525159836, + "rewards/reward_func/mean": -0.05624999850988388, + "rewards/reward_func/std": 0.04206712171435356, + "sampling/importance_sampling_ratio/max": 2.0452516078948975, + "sampling/importance_sampling_ratio/mean": 1.0319852828979492, + "sampling/importance_sampling_ratio/min": 0.5795266032218933, + "sampling/sampling_logp_difference/max": 0.531651496887207, + "sampling/sampling_logp_difference/mean": 0.02353527769446373, + "step": 60, + "step_time": 33.48683463499765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3609686493873596, + "epoch": 0.122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9293365478515625, + "kl": 0.002589891664683819, + "learning_rate": 4.988208552916535e-06, + "loss": -0.0442, + "num_tokens": 342410.0, + "reward": 0.20625001192092896, + "reward_std": 0.5265212059020996, + "rewards/reward_func/mean": 0.20625001192092896, + "rewards/reward_func/std": 0.48746979236602783, + "sampling/importance_sampling_ratio/max": 1.2214772701263428, + "sampling/importance_sampling_ratio/mean": 0.8363133668899536, + "sampling/importance_sampling_ratio/min": 0.4508911073207855, + "sampling/sampling_logp_difference/max": 0.46391355991363525, + "sampling/sampling_logp_difference/mean": 0.02813461422920227, + "step": 61, + "step_time": 35.721023104997585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 59.75, + "completions/mean_terminated_length": 59.75, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.38784918189048767, + "epoch": 0.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.075391173362732, + "kl": 0.0035081543028354645, + "learning_rate": 4.98741002628373e-06, + "loss": -0.0935, + "num_tokens": 347782.0, + "reward": 0.612500011920929, + "reward_std": 0.5446658134460449, + "rewards/reward_func/mean": 0.612500011920929, + "rewards/reward_func/std": 0.5239888429641724, + "sampling/importance_sampling_ratio/max": 1.6058270931243896, + "sampling/importance_sampling_ratio/mean": 0.8856536149978638, + "sampling/importance_sampling_ratio/min": 0.5055686831474304, + "sampling/sampling_logp_difference/max": 0.3406977653503418, + "sampling/sampling_logp_difference/mean": 0.025373805314302444, + "step": 62, + "step_time": 25.64868492000096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3399428427219391, + "epoch": 0.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1036839485168457, + "kl": 0.003299012314528227, + "learning_rate": 4.9865854078967715e-06, + "loss": -0.2298, + "num_tokens": 353226.0, + "reward": 0.6012499928474426, + "reward_std": 0.5651106834411621, + "rewards/reward_func/mean": 0.6012499928474426, + "rewards/reward_func/std": 0.5377848148345947, + "sampling/importance_sampling_ratio/max": 1.3698598146438599, + "sampling/importance_sampling_ratio/mean": 0.8536076545715332, + "sampling/importance_sampling_ratio/min": 0.3125140964984894, + "sampling/sampling_logp_difference/max": 1.194218635559082, + "sampling/sampling_logp_difference/mean": 0.023982733488082886, + "step": 63, + "step_time": 29.669544973992743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3663536310195923, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.173079252243042, + "kl": 0.0031317025423049927, + "learning_rate": 4.985734706405516e-06, + "loss": 0.2187, + "num_tokens": 359212.0, + "reward": -0.03999999910593033, + "reward_std": 0.04652039706707001, + "rewards/reward_func/mean": -0.03999999910593033, + "rewards/reward_func/std": 0.04956958070397377, + "sampling/importance_sampling_ratio/max": 1.2950257062911987, + "sampling/importance_sampling_ratio/mean": 0.8518853187561035, + "sampling/importance_sampling_ratio/min": 0.6529499888420105, + "sampling/sampling_logp_difference/max": 0.4296393394470215, + "sampling/sampling_logp_difference/mean": 0.027849294245243073, + "step": 64, + "step_time": 40.9083917550015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.39625003933906555, + "epoch": 0.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3868184089660645, + "kl": 0.003464975394308567, + "learning_rate": 4.9848579307334195e-06, + "loss": 0.1145, + "num_tokens": 365700.0, + "reward": 0.10374999791383743, + "reward_std": 0.2647004723548889, + "rewards/reward_func/mean": 0.10374999791383743, + "rewards/reward_func/std": 0.3624495267868042, + "sampling/importance_sampling_ratio/max": 1.4604157209396362, + "sampling/importance_sampling_ratio/mean": 1.0366566181182861, + "sampling/importance_sampling_ratio/min": 0.6759946346282959, + "sampling/sampling_logp_difference/max": 0.6729015111923218, + "sampling/sampling_logp_difference/mean": 0.02300150692462921, + "step": 65, + "step_time": 39.30383253400214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 57.125, + "completions/mean_terminated_length": 57.125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.4078044891357422, + "epoch": 0.132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.922768473625183, + "kl": 0.004820004105567932, + "learning_rate": 4.983955090077445e-06, + "loss": -0.3134, + "num_tokens": 370599.0, + "reward": 0.06875000149011612, + "reward_std": 0.2901986241340637, + "rewards/reward_func/mean": 0.06875000149011612, + "rewards/reward_func/std": 0.379301518201828, + "sampling/importance_sampling_ratio/max": 2.629911184310913, + "sampling/importance_sampling_ratio/mean": 1.3608753681182861, + "sampling/importance_sampling_ratio/min": 0.3729094862937927, + "sampling/sampling_logp_difference/max": 0.6287485361099243, + "sampling/sampling_logp_difference/mean": 0.025368856266140938, + "step": 66, + "step_time": 37.84037004499987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.307640016078949, + "epoch": 0.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7610776424407959, + "kl": 0.0031319891568273306, + "learning_rate": 4.983026193907962e-06, + "loss": 0.0798, + "num_tokens": 375798.0, + "reward": 0.46000000834465027, + "reward_std": 0.5894244313240051, + "rewards/reward_func/mean": 0.46000000834465027, + "rewards/reward_func/std": 0.5463646054267883, + "sampling/importance_sampling_ratio/max": 1.1422264575958252, + "sampling/importance_sampling_ratio/mean": 0.8120383024215698, + "sampling/importance_sampling_ratio/min": 0.4197941720485687, + "sampling/sampling_logp_difference/max": 0.45195698738098145, + "sampling/sampling_logp_difference/mean": 0.023039013147354126, + "step": 67, + "step_time": 30.603469858993776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 56.75, + "completions/mean_terminated_length": 56.75, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.36541643738746643, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7111493349075317, + "kl": 0.007591226138174534, + "learning_rate": 4.982071251968653e-06, + "loss": 0.1488, + "num_tokens": 381215.0, + "reward": 0.18000000715255737, + "reward_std": 0.33846431970596313, + "rewards/reward_func/mean": 0.18000000715255737, + "rewards/reward_func/std": 0.4910629987716675, + "sampling/importance_sampling_ratio/max": 1.9301010370254517, + "sampling/importance_sampling_ratio/mean": 1.1310442686080933, + "sampling/importance_sampling_ratio/min": 0.2971019148826599, + "sampling/sampling_logp_difference/max": 0.3519878387451172, + "sampling/sampling_logp_difference/mean": 0.025238394737243652, + "step": 68, + "step_time": 36.97006548898935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 50.875, + "completions/mean_terminated_length": 50.875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3671402633190155, + "epoch": 0.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2564339637756348, + "kl": 0.004587341565638781, + "learning_rate": 4.981090274276406e-06, + "loss": 0.0895, + "num_tokens": 387032.0, + "reward": 0.3462499976158142, + "reward_std": 0.2717037796974182, + "rewards/reward_func/mean": 0.3462499976158142, + "rewards/reward_func/std": 0.5316265821456909, + "sampling/importance_sampling_ratio/max": 1.0020262002944946, + "sampling/importance_sampling_ratio/mean": 0.821398913860321, + "sampling/importance_sampling_ratio/min": 0.551517903804779, + "sampling/sampling_logp_difference/max": 0.5747532844543457, + "sampling/sampling_logp_difference/mean": 0.02456159144639969, + "step": 69, + "step_time": 37.778685440003756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.2950865924358368, + "epoch": 0.14, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7871184349060059, + "kl": 0.00491219712421298, + "learning_rate": 4.980083271121215e-06, + "loss": -0.0193, + "num_tokens": 392609.0, + "reward": 0.45500004291534424, + "reward_std": 0.5168381333351135, + "rewards/reward_func/mean": 0.45500004291534424, + "rewards/reward_func/std": 0.550713837146759, + "sampling/importance_sampling_ratio/max": 1.3376414775848389, + "sampling/importance_sampling_ratio/mean": 0.9009042978286743, + "sampling/importance_sampling_ratio/min": 0.5153571963310242, + "sampling/sampling_logp_difference/max": 0.4256160259246826, + "sampling/sampling_logp_difference/mean": 0.021552588790655136, + "step": 70, + "step_time": 33.37109695599065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.36967194080352783, + "epoch": 0.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4737831354141235, + "kl": 0.01288725808262825, + "learning_rate": 4.979050253066064e-06, + "loss": -0.0912, + "num_tokens": 398608.0, + "reward": 0.20875000953674316, + "reward_std": 0.2906484007835388, + "rewards/reward_func/mean": 0.20875000953674316, + "rewards/reward_func/std": 0.47351083159446716, + "sampling/importance_sampling_ratio/max": 1.5353319644927979, + "sampling/importance_sampling_ratio/mean": 1.0639885663986206, + "sampling/importance_sampling_ratio/min": 0.7201797962188721, + "sampling/sampling_logp_difference/max": 0.34710121154785156, + "sampling/sampling_logp_difference/mean": 0.025040332227945328, + "step": 71, + "step_time": 39.80532811000012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3445550203323364, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8660765290260315, + "kl": 0.005788044538348913, + "learning_rate": 4.977991230946824e-06, + "loss": 0.0375, + "num_tokens": 403626.0, + "reward": 0.18250001966953278, + "reward_std": 0.3195389211177826, + "rewards/reward_func/mean": 0.18250001966953278, + "rewards/reward_func/std": 0.4660395383834839, + "sampling/importance_sampling_ratio/max": 1.5179165601730347, + "sampling/importance_sampling_ratio/mean": 1.0042223930358887, + "sampling/importance_sampling_ratio/min": 0.5119208693504333, + "sampling/sampling_logp_difference/max": 0.47091197967529297, + "sampling/sampling_logp_difference/mean": 0.02298400178551674, + "step": 72, + "step_time": 36.35284370899899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3828085660934448, + "epoch": 0.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.972751498222351, + "kl": 0.015708088874816895, + "learning_rate": 4.976906215872137e-06, + "loss": -0.1855, + "num_tokens": 409741.0, + "reward": 0.581250011920929, + "reward_std": 0.5518749952316284, + "rewards/reward_func/mean": 0.581250011920929, + "rewards/reward_func/std": 0.531801164150238, + "sampling/importance_sampling_ratio/max": 2.6650352478027344, + "sampling/importance_sampling_ratio/mean": 1.5247585773468018, + "sampling/importance_sampling_ratio/min": 0.9828287363052368, + "sampling/sampling_logp_difference/max": 0.5306482315063477, + "sampling/sampling_logp_difference/mean": 0.028494730591773987, + "step": 73, + "step_time": 36.31543107799371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 53.75, + "completions/mean_terminated_length": 53.75, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3141552209854126, + "epoch": 0.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3385131359100342, + "kl": 0.008036404848098755, + "learning_rate": 4.975795219223299e-06, + "loss": 0.0508, + "num_tokens": 415116.0, + "reward": 0.07625000178813934, + "reward_std": 0.27722302079200745, + "rewards/reward_func/mean": 0.07625000178813934, + "rewards/reward_func/std": 0.37408700585365295, + "sampling/importance_sampling_ratio/max": 1.733062505722046, + "sampling/importance_sampling_ratio/mean": 1.0138859748840332, + "sampling/importance_sampling_ratio/min": 0.45135366916656494, + "sampling/sampling_logp_difference/max": 0.582321286201477, + "sampling/sampling_logp_difference/mean": 0.020871151238679886, + "step": 74, + "step_time": 43.80073598799936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 54.875, + "completions/mean_terminated_length": 54.875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3232402801513672, + "epoch": 0.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7976144552230835, + "kl": 0.006514251232147217, + "learning_rate": 4.974658252654135e-06, + "loss": 0.0469, + "num_tokens": 420518.0, + "reward": 0.20875000953674316, + "reward_std": 0.5280059576034546, + "rewards/reward_func/mean": 0.20875000953674316, + "rewards/reward_func/std": 0.4891519546508789, + "sampling/importance_sampling_ratio/max": 0.8984686136245728, + "sampling/importance_sampling_ratio/mean": 0.6726990938186646, + "sampling/importance_sampling_ratio/min": 0.3876783549785614, + "sampling/sampling_logp_difference/max": 0.6330904960632324, + "sampling/sampling_logp_difference/mean": 0.026919633150100708, + "step": 75, + "step_time": 35.85063866600103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3804740905761719, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9046527743339539, + "kl": 0.0029437714256346226, + "learning_rate": 4.973495328090891e-06, + "loss": -0.0522, + "num_tokens": 425455.0, + "reward": 0.5950000286102295, + "reward_std": 0.5526574850082397, + "rewards/reward_func/mean": 0.5950000286102295, + "rewards/reward_func/std": 0.5316550731658936, + "sampling/importance_sampling_ratio/max": 1.0966424942016602, + "sampling/importance_sampling_ratio/mean": 0.7680986523628235, + "sampling/importance_sampling_ratio/min": 0.4954715371131897, + "sampling/sampling_logp_difference/max": 0.29123687744140625, + "sampling/sampling_logp_difference/mean": 0.023649394512176514, + "step": 76, + "step_time": 30.126132238001446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.335227370262146, + "epoch": 0.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8402533531188965, + "kl": 0.012151396833360195, + "learning_rate": 4.972306457732091e-06, + "loss": -0.1083, + "num_tokens": 430739.0, + "reward": 0.32999998331069946, + "reward_std": 0.30979883670806885, + "rewards/reward_func/mean": 0.32999998331069946, + "rewards/reward_func/std": 0.5463384985923767, + "sampling/importance_sampling_ratio/max": 1.4036647081375122, + "sampling/importance_sampling_ratio/mean": 0.8151348829269409, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9675121307373047, + "sampling/sampling_logp_difference/mean": 0.02233710139989853, + "step": 77, + "step_time": 33.453950964001706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.34430697560310364, + "epoch": 0.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8867908716201782, + "kl": 0.0038880002684891224, + "learning_rate": 4.971091654048427e-06, + "loss": -0.1323, + "num_tokens": 437126.0, + "reward": 0.21124999225139618, + "reward_std": 0.5004571676254272, + "rewards/reward_func/mean": 0.21124999225139618, + "rewards/reward_func/std": 0.46366357803344727, + "sampling/importance_sampling_ratio/max": 1.108424186706543, + "sampling/importance_sampling_ratio/mean": 0.829704999923706, + "sampling/importance_sampling_ratio/min": 0.47837772965431213, + "sampling/sampling_logp_difference/max": 0.3985975682735443, + "sampling/sampling_logp_difference/mean": 0.020493805408477783, + "step": 78, + "step_time": 40.64764082799957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3588639497756958, + "epoch": 0.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0783358812332153, + "kl": 0.005843465216457844, + "learning_rate": 4.96985092978261e-06, + "loss": -0.0939, + "num_tokens": 442120.0, + "reward": 0.32875001430511475, + "reward_std": 0.5790164470672607, + "rewards/reward_func/mean": 0.32875001430511475, + "rewards/reward_func/std": 0.5529256463050842, + "sampling/importance_sampling_ratio/max": 1.6267454624176025, + "sampling/importance_sampling_ratio/mean": 0.9230892658233643, + "sampling/importance_sampling_ratio/min": 0.31369680166244507, + "sampling/sampling_logp_difference/max": 0.3530135154724121, + "sampling/sampling_logp_difference/mean": 0.025078624486923218, + "step": 79, + "step_time": 32.59568661899539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 45.625, + "completions/mean_terminated_length": 45.625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.35594335198402405, + "epoch": 0.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.645215630531311, + "kl": 0.004302887246012688, + "learning_rate": 4.968584297949255e-06, + "loss": -0.2434, + "num_tokens": 447668.0, + "reward": 0.29624998569488525, + "reward_std": 0.5543216466903687, + "rewards/reward_func/mean": 0.29624998569488525, + "rewards/reward_func/std": 0.5293645262718201, + "sampling/importance_sampling_ratio/max": 1.6743134260177612, + "sampling/importance_sampling_ratio/mean": 1.0188958644866943, + "sampling/importance_sampling_ratio/min": 0.6892949938774109, + "sampling/sampling_logp_difference/max": 0.3438667058944702, + "sampling/sampling_logp_difference/mean": 0.02349107712507248, + "step": 80, + "step_time": 41.75027698998747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 44.625, + "completions/mean_terminated_length": 44.625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.40325072407722473, + "epoch": 0.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9571352601051331, + "kl": 0.004008749034255743, + "learning_rate": 4.967291771834727e-06, + "loss": 0.0373, + "num_tokens": 453098.0, + "reward": 0.17500001192092896, + "reward_std": 0.34363842010498047, + "rewards/reward_func/mean": 0.17500001192092896, + "rewards/reward_func/std": 0.5028774738311768, + "sampling/importance_sampling_ratio/max": 1.3233540058135986, + "sampling/importance_sampling_ratio/mean": 0.8455219268798828, + "sampling/importance_sampling_ratio/min": 0.36903640627861023, + "sampling/sampling_logp_difference/max": 0.3818695545196533, + "sampling/sampling_logp_difference/mean": 0.03359740972518921, + "step": 81, + "step_time": 40.28149596700678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.32033318281173706, + "epoch": 0.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9337257146835327, + "kl": 0.011265389621257782, + "learning_rate": 4.965973364997015e-06, + "loss": 0.0583, + "num_tokens": 459142.0, + "reward": 0.20500001311302185, + "reward_std": 0.3244031071662903, + "rewards/reward_func/mean": 0.20500001311302185, + "rewards/reward_func/std": 0.49210917949676514, + "sampling/importance_sampling_ratio/max": 1.5231561660766602, + "sampling/importance_sampling_ratio/mean": 0.8062511682510376, + "sampling/importance_sampling_ratio/min": 0.09447702020406723, + "sampling/sampling_logp_difference/max": 1.0004373788833618, + "sampling/sampling_logp_difference/mean": 0.027456309646368027, + "step": 82, + "step_time": 36.980090021010255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3576924800872803, + "epoch": 0.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1017056703567505, + "kl": 0.009185336530208588, + "learning_rate": 4.964629091265583e-06, + "loss": 0.0681, + "num_tokens": 464285.0, + "reward": 0.3387500047683716, + "reward_std": 0.5671484470367432, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.54511958360672, + "sampling/importance_sampling_ratio/max": 1.5919902324676514, + "sampling/importance_sampling_ratio/mean": 0.9405478835105896, + "sampling/importance_sampling_ratio/min": 0.2894143760204315, + "sampling/sampling_logp_difference/max": 0.8768386840820312, + "sampling/sampling_logp_difference/mean": 0.028965311124920845, + "step": 83, + "step_time": 28.138020016995142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 54.875, + "completions/mean_terminated_length": 54.875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3319105803966522, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.053510069847107, + "kl": 0.0065056635066866875, + "learning_rate": 4.963258964741227e-06, + "loss": -0.1728, + "num_tokens": 469551.0, + "reward": 0.08000000566244125, + "reward_std": 0.2886171042919159, + "rewards/reward_func/mean": 0.08000000566244125, + "rewards/reward_func/std": 0.3737073242664337, + "sampling/importance_sampling_ratio/max": 1.9180289506912231, + "sampling/importance_sampling_ratio/mean": 1.1588903665542603, + "sampling/importance_sampling_ratio/min": 0.5762325525283813, + "sampling/sampling_logp_difference/max": 0.4678354263305664, + "sampling/sampling_logp_difference/mean": 0.024262480437755585, + "step": 84, + "step_time": 37.53115506299946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3720715641975403, + "epoch": 0.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3156875371932983, + "kl": 0.08773906528949738, + "learning_rate": 4.961862999795923e-06, + "loss": 0.0025, + "num_tokens": 475505.0, + "reward": 0.4750000238418579, + "reward_std": 0.5239638686180115, + "rewards/reward_func/mean": 0.4750000238418579, + "rewards/reward_func/std": 0.5539726614952087, + "sampling/importance_sampling_ratio/max": 1.7880008220672607, + "sampling/importance_sampling_ratio/mean": 0.9351547956466675, + "sampling/importance_sampling_ratio/min": 0.23688003420829773, + "sampling/sampling_logp_difference/max": 1.3453662395477295, + "sampling/sampling_logp_difference/mean": 0.02895110286772251, + "step": 85, + "step_time": 35.246074732000125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 46.0, + "completions/mean_terminated_length": 46.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.32649070024490356, + "epoch": 0.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0877724885940552, + "kl": 0.009383026510477066, + "learning_rate": 4.960441211072686e-06, + "loss": 0.1414, + "num_tokens": 480647.0, + "reward": 0.07375000417232513, + "reward_std": 0.27671927213668823, + "rewards/reward_func/mean": 0.07375000417232513, + "rewards/reward_func/std": 0.3709038197994232, + "sampling/importance_sampling_ratio/max": 1.5382874011993408, + "sampling/importance_sampling_ratio/mean": 1.010871171951294, + "sampling/importance_sampling_ratio/min": 0.32187968492507935, + "sampling/sampling_logp_difference/max": 0.4144246578216553, + "sampling/sampling_logp_difference/mean": 0.01961221918463707, + "step": 86, + "step_time": 33.18983396900876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.33819085359573364, + "epoch": 0.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5221152305603027, + "kl": 0.03121965564787388, + "learning_rate": 4.958993613485406e-06, + "loss": 0.2319, + "num_tokens": 485763.0, + "reward": 0.17625001072883606, + "reward_std": 0.5154582262039185, + "rewards/reward_func/mean": 0.17625001072883606, + "rewards/reward_func/std": 0.47874653339385986, + "sampling/importance_sampling_ratio/max": 1.8117644786834717, + "sampling/importance_sampling_ratio/mean": 1.0830358266830444, + "sampling/importance_sampling_ratio/min": 0.344952791929245, + "sampling/sampling_logp_difference/max": 0.6365394592285156, + "sampling/sampling_logp_difference/mean": 0.02507655695080757, + "step": 87, + "step_time": 38.59099734299525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 57.75, + "completions/mean_terminated_length": 57.75, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.3603730797767639, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.142964482307434, + "kl": 0.007601139135658741, + "learning_rate": 4.957520222218695e-06, + "loss": 0.1548, + "num_tokens": 491817.0, + "reward": 0.08624999970197678, + "reward_std": 0.27129849791526794, + "rewards/reward_func/mean": 0.08624999970197678, + "rewards/reward_func/std": 0.3660186529159546, + "sampling/importance_sampling_ratio/max": 2.3958003520965576, + "sampling/importance_sampling_ratio/mean": 1.1398122310638428, + "sampling/importance_sampling_ratio/min": 0.34555134177207947, + "sampling/sampling_logp_difference/max": 0.3420066833496094, + "sampling/sampling_logp_difference/mean": 0.01923590898513794, + "step": 88, + "step_time": 43.65914840900223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 54.25, + "completions/mean_terminated_length": 54.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.4104352593421936, + "epoch": 0.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8413081169128418, + "kl": 0.010053085163235664, + "learning_rate": 4.956021052727731e-06, + "loss": -0.0215, + "num_tokens": 497655.0, + "reward": 0.3449999988079071, + "reward_std": 0.5602920651435852, + "rewards/reward_func/mean": 0.3449999988079071, + "rewards/reward_func/std": 0.5405552983283997, + "sampling/importance_sampling_ratio/max": 1.5695143938064575, + "sampling/importance_sampling_ratio/mean": 0.9689695835113525, + "sampling/importance_sampling_ratio/min": 0.6118794083595276, + "sampling/sampling_logp_difference/max": 0.7474043369293213, + "sampling/sampling_logp_difference/mean": 0.024773046374320984, + "step": 89, + "step_time": 32.42648905600072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 59.5, + "completions/mean_terminated_length": 59.5, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.33168306946754456, + "epoch": 0.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.541452169418335, + "kl": 0.0170269925147295, + "learning_rate": 4.954496120738094e-06, + "loss": 0.1195, + "num_tokens": 503146.0, + "reward": 0.4775000214576721, + "reward_std": 0.512241542339325, + "rewards/reward_func/mean": 0.4775000214576721, + "rewards/reward_func/std": 0.5457040071487427, + "sampling/importance_sampling_ratio/max": 1.8365960121154785, + "sampling/importance_sampling_ratio/mean": 1.0645668506622314, + "sampling/importance_sampling_ratio/min": 0.3866049349308014, + "sampling/sampling_logp_difference/max": 0.6471219062805176, + "sampling/sampling_logp_difference/mean": 0.021687893196940422, + "step": 90, + "step_time": 25.5371619570069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.34987396001815796, + "epoch": 0.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8267898559570312, + "kl": 0.009192817844450474, + "learning_rate": 4.952945442245598e-06, + "loss": -0.1841, + "num_tokens": 509047.0, + "reward": 0.21250000596046448, + "reward_std": 0.5125744342803955, + "rewards/reward_func/mean": 0.21250000596046448, + "rewards/reward_func/std": 0.47475558519363403, + "sampling/importance_sampling_ratio/max": 1.1211237907409668, + "sampling/importance_sampling_ratio/mean": 0.6856480836868286, + "sampling/importance_sampling_ratio/min": 0.16603736579418182, + "sampling/sampling_logp_difference/max": 0.8373830318450928, + "sampling/sampling_logp_difference/mean": 0.02722262404859066, + "step": 91, + "step_time": 32.49494073499227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 52.0, + "completions/mean_terminated_length": 52.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3588416576385498, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3908851146697998, + "kl": 0.04168040677905083, + "learning_rate": 4.951369033516127e-06, + "loss": -0.0976, + "num_tokens": 514627.0, + "reward": 0.45875000953674316, + "reward_std": 0.5896173715591431, + "rewards/reward_func/mean": 0.45875000953674316, + "rewards/reward_func/std": 0.54619300365448, + "sampling/importance_sampling_ratio/max": 1.545922040939331, + "sampling/importance_sampling_ratio/mean": 1.153027057647705, + "sampling/importance_sampling_ratio/min": 0.5263999104499817, + "sampling/sampling_logp_difference/max": 0.7717450857162476, + "sampling/sampling_logp_difference/mean": 0.02575552463531494, + "step": 92, + "step_time": 39.09205864999967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3676631450653076, + "epoch": 0.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2240070104599, + "kl": 0.02092805504798889, + "learning_rate": 4.949766911085461e-06, + "loss": 0.0766, + "num_tokens": 520393.0, + "reward": 0.48000001907348633, + "reward_std": 0.5069187879562378, + "rewards/reward_func/mean": 0.48000001907348633, + "rewards/reward_func/std": 0.5400793552398682, + "sampling/importance_sampling_ratio/max": 2.340593099594116, + "sampling/importance_sampling_ratio/mean": 1.5034531354904175, + "sampling/importance_sampling_ratio/min": 0.6435762643814087, + "sampling/sampling_logp_difference/max": 0.8327808380126953, + "sampling/sampling_logp_difference/mean": 0.023994414135813713, + "step": 93, + "step_time": 37.435129256002256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 59.375, + "completions/mean_terminated_length": 59.375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.4088976979255676, + "epoch": 0.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.024932622909546, + "kl": 0.004795517306774855, + "learning_rate": 4.948139091759108e-06, + "loss": 0.2021, + "num_tokens": 526559.0, + "reward": 0.48625001311302185, + "reward_std": 0.5142859816551208, + "rewards/reward_func/mean": 0.48625001311302185, + "rewards/reward_func/std": 0.5441097021102905, + "sampling/importance_sampling_ratio/max": 1.42252779006958, + "sampling/importance_sampling_ratio/mean": 0.6734194159507751, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.3571433424949646, + "sampling/sampling_logp_difference/mean": 0.02588842436671257, + "step": 94, + "step_time": 38.451151984001626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 53.5, + "completions/mean_terminated_length": 53.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.34375160932540894, + "epoch": 0.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9852098226547241, + "kl": 0.00799483060836792, + "learning_rate": 4.946485592612122e-06, + "loss": 0.0547, + "num_tokens": 532321.0, + "reward": 0.22374999523162842, + "reward_std": 0.5145085453987122, + "rewards/reward_func/mean": 0.22374999523162842, + "rewards/reward_func/std": 0.4764433205127716, + "sampling/importance_sampling_ratio/max": 0.9923078417778015, + "sampling/importance_sampling_ratio/mean": 0.7997302412986755, + "sampling/importance_sampling_ratio/min": 0.577215850353241, + "sampling/sampling_logp_difference/max": 0.3057703971862793, + "sampling/sampling_logp_difference/mean": 0.019131341949105263, + "step": 95, + "step_time": 34.67093198900693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.33774328231811523, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1137478351593018, + "kl": 0.009615856222808361, + "learning_rate": 4.944806430988927e-06, + "loss": -0.1692, + "num_tokens": 537647.0, + "reward": 0.048750005662441254, + "reward_std": 0.286370187997818, + "rewards/reward_func/mean": 0.048750005662441254, + "rewards/reward_func/std": 0.378132164478302, + "sampling/importance_sampling_ratio/max": 2.006246328353882, + "sampling/importance_sampling_ratio/mean": 1.1477537155151367, + "sampling/importance_sampling_ratio/min": 0.2995043992996216, + "sampling/sampling_logp_difference/max": 1.0059900283813477, + "sampling/sampling_logp_difference/mean": 0.025709955021739006, + "step": 96, + "step_time": 36.05396345700137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 58.75, + "completions/mean_terminated_length": 58.75, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.3597293794155121, + "epoch": 0.194, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5075836181640625, + "kl": 0.010133556090295315, + "learning_rate": 4.943101624503133e-06, + "loss": -0.2914, + "num_tokens": 543234.0, + "reward": 0.21250000596046448, + "reward_std": 0.5191043615341187, + "rewards/reward_func/mean": 0.21250000596046448, + "rewards/reward_func/std": 0.4812706708908081, + "sampling/importance_sampling_ratio/max": 1.8020318746566772, + "sampling/importance_sampling_ratio/mean": 1.1152775287628174, + "sampling/importance_sampling_ratio/min": 0.6069132685661316, + "sampling/sampling_logp_difference/max": 0.5707888603210449, + "sampling/sampling_logp_difference/mean": 0.026453383266925812, + "step": 97, + "step_time": 35.73249283900077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3398410677909851, + "epoch": 0.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3996599912643433, + "kl": 0.008223360404372215, + "learning_rate": 4.941371191037353e-06, + "loss": -0.1045, + "num_tokens": 548982.0, + "reward": 0.45249998569488525, + "reward_std": 0.5502059459686279, + "rewards/reward_func/mean": 0.45249998569488525, + "rewards/reward_func/std": 0.5803878307342529, + "sampling/importance_sampling_ratio/max": 1.8260504007339478, + "sampling/importance_sampling_ratio/mean": 1.0000381469726562, + "sampling/importance_sampling_ratio/min": 0.422150194644928, + "sampling/sampling_logp_difference/max": 0.4241912364959717, + "sampling/sampling_logp_difference/mean": 0.023312591016292572, + "step": 98, + "step_time": 38.02977259900945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3524587154388428, + "epoch": 0.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7168464660644531, + "kl": 0.0064958324655890465, + "learning_rate": 4.939615148743017e-06, + "loss": 0.2992, + "num_tokens": 554371.0, + "reward": 0.5700000524520874, + "reward_std": 0.5693867802619934, + "rewards/reward_func/mean": 0.5700000524520874, + "rewards/reward_func/std": 0.5539984703063965, + "sampling/importance_sampling_ratio/max": 1.9318469762802124, + "sampling/importance_sampling_ratio/mean": 1.0261414051055908, + "sampling/importance_sampling_ratio/min": 0.5000623464584351, + "sampling/sampling_logp_difference/max": 0.5306458473205566, + "sampling/sampling_logp_difference/mean": 0.02562127634882927, + "step": 99, + "step_time": 28.632846849999623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 58.75, + "completions/mean_terminated_length": 58.75, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.343850314617157, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1735574007034302, + "kl": 0.012371711432933807, + "learning_rate": 4.937833516040177e-06, + "loss": 0.1179, + "num_tokens": 560801.0, + "reward": 0.32749998569488525, + "reward_std": 0.5631698369979858, + "rewards/reward_func/mean": 0.32749998569488525, + "rewards/reward_func/std": 0.5443934798240662, + "sampling/importance_sampling_ratio/max": 1.9959181547164917, + "sampling/importance_sampling_ratio/mean": 1.3071322441101074, + "sampling/importance_sampling_ratio/min": 0.6965984106063843, + "sampling/sampling_logp_difference/max": 0.5585286617279053, + "sampling/sampling_logp_difference/mean": 0.018983395770192146, + "step": 100, + "step_time": 39.44753839400073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.35799241065979004, + "epoch": 0.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0344713926315308, + "kl": 0.014730488881468773, + "learning_rate": 4.936026311617316e-06, + "loss": 0.2868, + "num_tokens": 566165.0, + "reward": 0.07999999821186066, + "reward_std": 0.2884673476219177, + "rewards/reward_func/mean": 0.07999999821186066, + "rewards/reward_func/std": 0.37599390745162964, + "sampling/importance_sampling_ratio/max": 1.5769941806793213, + "sampling/importance_sampling_ratio/mean": 0.9793978333473206, + "sampling/importance_sampling_ratio/min": 0.3244423568248749, + "sampling/sampling_logp_difference/max": 0.5661906003952026, + "sampling/sampling_logp_difference/mean": 0.027164215222001076, + "step": 101, + "step_time": 36.72873655399599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.34581878781318665, + "epoch": 0.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3470356464385986, + "kl": 0.0077892690896987915, + "learning_rate": 4.9341935544311536e-06, + "loss": -0.1361, + "num_tokens": 570990.0, + "reward": 0.3500000238418579, + "reward_std": 0.5515884160995483, + "rewards/reward_func/mean": 0.3500000238418579, + "rewards/reward_func/std": 0.5307945609092712, + "sampling/importance_sampling_ratio/max": 1.5632634162902832, + "sampling/importance_sampling_ratio/mean": 0.9828509092330933, + "sampling/importance_sampling_ratio/min": 0.5150687098503113, + "sampling/sampling_logp_difference/max": 0.46799755096435547, + "sampling/sampling_logp_difference/mean": 0.0202273391187191, + "step": 102, + "step_time": 24.58553309200215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 51.75, + "completions/mean_terminated_length": 51.75, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.33340394496917725, + "epoch": 0.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9854996800422668, + "kl": 0.007550486363470554, + "learning_rate": 4.932335263706446e-06, + "loss": -0.1328, + "num_tokens": 576939.0, + "reward": 0.19999998807907104, + "reward_std": 0.5018428564071655, + "rewards/reward_func/mean": 0.19999998807907104, + "rewards/reward_func/std": 0.4653416574001312, + "sampling/importance_sampling_ratio/max": 2.2604258060455322, + "sampling/importance_sampling_ratio/mean": 0.915320098400116, + "sampling/importance_sampling_ratio/min": 0.19731059670448303, + "sampling/sampling_logp_difference/max": 1.3153386116027832, + "sampling/sampling_logp_difference/mean": 0.024587368592619896, + "step": 103, + "step_time": 37.690257228998234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3149360716342926, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9567559361457825, + "kl": 0.017881600186228752, + "learning_rate": 4.930451458935783e-06, + "loss": -0.2071, + "num_tokens": 581824.0, + "reward": 0.20374999940395355, + "reward_std": 0.5112870335578918, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.4740384817123413, + "sampling/importance_sampling_ratio/max": 1.1789283752441406, + "sampling/importance_sampling_ratio/mean": 0.6693528890609741, + "sampling/importance_sampling_ratio/min": 0.29736894369125366, + "sampling/sampling_logp_difference/max": 0.6565618515014648, + "sampling/sampling_logp_difference/mean": 0.027554277330636978, + "step": 104, + "step_time": 30.336770003996207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 48.125, + "completions/mean_terminated_length": 48.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3541415333747864, + "epoch": 0.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6499571800231934, + "kl": 0.019035879522562027, + "learning_rate": 4.928542159879386e-06, + "loss": -0.2454, + "num_tokens": 586973.0, + "reward": 0.29624998569488525, + "reward_std": 0.5775634050369263, + "rewards/reward_func/mean": 0.29624998569488525, + "rewards/reward_func/std": 0.5488412976264954, + "sampling/importance_sampling_ratio/max": 1.7103009223937988, + "sampling/importance_sampling_ratio/mean": 1.2459993362426758, + "sampling/importance_sampling_ratio/min": 0.36490964889526367, + "sampling/sampling_logp_difference/max": 0.5930310487747192, + "sampling/sampling_logp_difference/mean": 0.02690104767680168, + "step": 105, + "step_time": 37.13503508499707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 46.375, + "completions/mean_terminated_length": 46.375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.38179636001586914, + "epoch": 0.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3703417778015137, + "kl": 0.07741641998291016, + "learning_rate": 4.926607386564898e-06, + "loss": -0.1311, + "num_tokens": 592270.0, + "reward": 0.21875, + "reward_std": 0.296247273683548, + "rewards/reward_func/mean": 0.21875, + "rewards/reward_func/std": 0.4547664523124695, + "sampling/importance_sampling_ratio/max": 2.1035547256469727, + "sampling/importance_sampling_ratio/mean": 0.9159399271011353, + "sampling/importance_sampling_ratio/min": 0.3674011826515198, + "sampling/sampling_logp_difference/max": 0.7901006937026978, + "sampling/sampling_logp_difference/mean": 0.030587412416934967, + "step": 106, + "step_time": 35.471169879994704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3480619192123413, + "epoch": 0.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0856449604034424, + "kl": 0.01090352050960064, + "learning_rate": 4.924647159287176e-06, + "loss": -0.1514, + "num_tokens": 597816.0, + "reward": 0.19499999284744263, + "reward_std": 0.5200541615486145, + "rewards/reward_func/mean": 0.19499999284744263, + "rewards/reward_func/std": 0.48373547196388245, + "sampling/importance_sampling_ratio/max": 1.8715648651123047, + "sampling/importance_sampling_ratio/mean": 0.986532986164093, + "sampling/importance_sampling_ratio/min": 0.3344648778438568, + "sampling/sampling_logp_difference/max": 0.6495161056518555, + "sampling/sampling_logp_difference/mean": 0.025133948773145676, + "step": 107, + "step_time": 37.98941664501035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 49.875, + "completions/mean_terminated_length": 49.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.33404314517974854, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1951714754104614, + "kl": 0.009612706489861012, + "learning_rate": 4.922661498608077e-06, + "loss": -0.0089, + "num_tokens": 602998.0, + "reward": 0.20750001072883606, + "reward_std": 0.3204679489135742, + "rewards/reward_func/mean": 0.20750001072883606, + "rewards/reward_func/std": 0.4872298240661621, + "sampling/importance_sampling_ratio/max": 1.3054614067077637, + "sampling/importance_sampling_ratio/mean": 0.8988068103790283, + "sampling/importance_sampling_ratio/min": 0.4151885509490967, + "sampling/sampling_logp_difference/max": 0.5403620004653931, + "sampling/sampling_logp_difference/mean": 0.023432891815900803, + "step": 108, + "step_time": 40.85303926198685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 54.125, + "completions/mean_terminated_length": 54.125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.30731695890426636, + "epoch": 0.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.472243070602417, + "kl": 0.015070393681526184, + "learning_rate": 4.920650425356239e-06, + "loss": -0.2328, + "num_tokens": 608265.0, + "reward": 0.4625000059604645, + "reward_std": 0.5839909315109253, + "rewards/reward_func/mean": 0.4625000059604645, + "rewards/reward_func/std": 0.5412881970405579, + "sampling/importance_sampling_ratio/max": 2.572606086730957, + "sampling/importance_sampling_ratio/mean": 1.3286433219909668, + "sampling/importance_sampling_ratio/min": 0.5425832271575928, + "sampling/sampling_logp_difference/max": 0.5253163576126099, + "sampling/sampling_logp_difference/mean": 0.023545201867818832, + "step": 109, + "step_time": 35.24712469300721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.32885903120040894, + "epoch": 0.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0502865314483643, + "kl": 0.033357176929712296, + "learning_rate": 4.9186139606268735e-06, + "loss": 0.1122, + "num_tokens": 613625.0, + "reward": 0.4650000035762787, + "reward_std": 0.5683454871177673, + "rewards/reward_func/mean": 0.4650000035762787, + "rewards/reward_func/std": 0.5274466872215271, + "sampling/importance_sampling_ratio/max": 1.862295150756836, + "sampling/importance_sampling_ratio/mean": 0.8837443590164185, + "sampling/importance_sampling_ratio/min": 0.2522660791873932, + "sampling/sampling_logp_difference/max": 0.8349018096923828, + "sampling/sampling_logp_difference/mean": 0.02544923685491085, + "step": 110, + "step_time": 31.10387792700203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.36214321851730347, + "epoch": 0.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5155655145645142, + "kl": 0.006974513176828623, + "learning_rate": 4.916552125781529e-06, + "loss": 0.0182, + "num_tokens": 619329.0, + "reward": 0.44875001907348633, + "reward_std": 0.5807082653045654, + "rewards/reward_func/mean": 0.44875001907348633, + "rewards/reward_func/std": 0.5378113985061646, + "sampling/importance_sampling_ratio/max": 2.2787792682647705, + "sampling/importance_sampling_ratio/mean": 1.3124430179595947, + "sampling/importance_sampling_ratio/min": 0.5105971097946167, + "sampling/sampling_logp_difference/max": 0.5723431706428528, + "sampling/sampling_logp_difference/mean": 0.02273915708065033, + "step": 111, + "step_time": 39.27943245699862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.35876524448394775, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9791905879974365, + "kl": 0.02388225495815277, + "learning_rate": 4.9144649424478765e-06, + "loss": -0.2625, + "num_tokens": 624851.0, + "reward": 0.07499999552965164, + "reward_std": 0.2920842170715332, + "rewards/reward_func/mean": 0.07499999552965164, + "rewards/reward_func/std": 0.37305688858032227, + "sampling/importance_sampling_ratio/max": 1.6145038604736328, + "sampling/importance_sampling_ratio/mean": 0.7806305885314941, + "sampling/importance_sampling_ratio/min": 0.31658169627189636, + "sampling/sampling_logp_difference/max": 0.9168522357940674, + "sampling/sampling_logp_difference/mean": 0.031929485499858856, + "step": 112, + "step_time": 40.077556558011565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 57.25, + "completions/mean_terminated_length": 57.25, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.39105433225631714, + "epoch": 0.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3499541282653809, + "kl": 0.03188120573759079, + "learning_rate": 4.912352432519484e-06, + "loss": -0.2043, + "num_tokens": 630276.0, + "reward": 0.2224999964237213, + "reward_std": 0.5185043811798096, + "rewards/reward_func/mean": 0.2224999964237213, + "rewards/reward_func/std": 0.48052799701690674, + "sampling/importance_sampling_ratio/max": 1.5414526462554932, + "sampling/importance_sampling_ratio/mean": 1.035444736480713, + "sampling/importance_sampling_ratio/min": 0.3080333471298218, + "sampling/sampling_logp_difference/max": 0.841944694519043, + "sampling/sampling_logp_difference/mean": 0.025317739695310593, + "step": 113, + "step_time": 33.08625747299811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.35971030592918396, + "epoch": 0.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4594817161560059, + "kl": 0.017640622332692146, + "learning_rate": 4.910214618155579e-06, + "loss": 0.1665, + "num_tokens": 636074.0, + "reward": 0.3162499964237213, + "reward_std": 0.5943635106086731, + "rewards/reward_func/mean": 0.3162499964237213, + "rewards/reward_func/std": 0.5652796626091003, + "sampling/importance_sampling_ratio/max": 2.2878103256225586, + "sampling/importance_sampling_ratio/mean": 1.1907858848571777, + "sampling/importance_sampling_ratio/min": 0.431149423122406, + "sampling/sampling_logp_difference/max": 0.8824386596679688, + "sampling/sampling_logp_difference/mean": 0.023232053965330124, + "step": 114, + "step_time": 34.20750616800797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 46.5, + "completions/mean_terminated_length": 46.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.35053420066833496, + "epoch": 0.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8730654120445251, + "kl": 0.10532344877719879, + "learning_rate": 4.908051521780824e-06, + "loss": -0.0439, + "num_tokens": 641976.0, + "reward": 0.3449999988079071, + "reward_std": 0.5371800661087036, + "rewards/reward_func/mean": 0.3449999988079071, + "rewards/reward_func/std": 0.5183214545249939, + "sampling/importance_sampling_ratio/max": 1.1692423820495605, + "sampling/importance_sampling_ratio/mean": 0.8554747700691223, + "sampling/importance_sampling_ratio/min": 0.27807939052581787, + "sampling/sampling_logp_difference/max": 1.0537323951721191, + "sampling/sampling_logp_difference/mean": 0.028338592499494553, + "step": 115, + "step_time": 39.78751064000244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 56.125, + "completions/mean_terminated_length": 56.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.34610265493392944, + "epoch": 0.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7445147633552551, + "kl": 0.012357879430055618, + "learning_rate": 4.905863166085076e-06, + "loss": -0.1522, + "num_tokens": 647397.0, + "reward": -0.04374999925494194, + "reward_std": 0.044737864285707474, + "rewards/reward_func/mean": -0.04374999925494194, + "rewards/reward_func/std": 0.04340424761176109, + "sampling/importance_sampling_ratio/max": 1.5308575630187988, + "sampling/importance_sampling_ratio/mean": 0.9485741853713989, + "sampling/importance_sampling_ratio/min": 0.4332679510116577, + "sampling/sampling_logp_difference/max": 0.3149690628051758, + "sampling/sampling_logp_difference/mean": 0.021136745810508728, + "step": 116, + "step_time": 34.7742198099877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 45.75, + "completions/mean_terminated_length": 45.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.31725019216537476, + "epoch": 0.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.139970302581787, + "kl": 0.019606376066803932, + "learning_rate": 4.903649574023151e-06, + "loss": -0.105, + "num_tokens": 653878.0, + "reward": 0.17999999225139618, + "reward_std": 0.5323570966720581, + "rewards/reward_func/mean": 0.17999999225139618, + "rewards/reward_func/std": 0.49361640214920044, + "sampling/importance_sampling_ratio/max": 1.8025496006011963, + "sampling/importance_sampling_ratio/mean": 1.0302600860595703, + "sampling/importance_sampling_ratio/min": 0.5091882944107056, + "sampling/sampling_logp_difference/max": 0.5300393104553223, + "sampling/sampling_logp_difference/mean": 0.022776808589696884, + "step": 117, + "step_time": 40.60955931300123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3315849006175995, + "epoch": 0.236, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.014474868774414, + "kl": 0.025705253705382347, + "learning_rate": 4.901410768814581e-06, + "loss": 0.2675, + "num_tokens": 660043.0, + "reward": 0.05625000223517418, + "reward_std": 0.285952627658844, + "rewards/reward_func/mean": 0.05625000223517418, + "rewards/reward_func/std": 0.36146280169487, + "sampling/importance_sampling_ratio/max": 2.1567413806915283, + "sampling/importance_sampling_ratio/mean": 0.9692880511283875, + "sampling/importance_sampling_ratio/min": 0.07723711431026459, + "sampling/sampling_logp_difference/max": 0.8733996152877808, + "sampling/sampling_logp_difference/mean": 0.03249724209308624, + "step": 118, + "step_time": 38.65960728799109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 50.875, + "completions/mean_terminated_length": 50.875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3267901837825775, + "epoch": 0.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1987251043319702, + "kl": 0.03742775321006775, + "learning_rate": 4.899146773943374e-06, + "loss": -0.582, + "num_tokens": 665037.0, + "reward": 0.3512499928474426, + "reward_std": 0.5559871196746826, + "rewards/reward_func/mean": 0.3512499928474426, + "rewards/reward_func/std": 0.5351218581199646, + "sampling/importance_sampling_ratio/max": 1.7580586671829224, + "sampling/importance_sampling_ratio/mean": 0.7292503714561462, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.458031415939331, + "sampling/sampling_logp_difference/mean": 0.029961854219436646, + "step": 119, + "step_time": 26.94828627500101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3302837014198303, + "epoch": 0.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7964790463447571, + "kl": 0.018735038116574287, + "learning_rate": 4.896857613157765e-06, + "loss": 0.0465, + "num_tokens": 670757.0, + "reward": 0.09125000238418579, + "reward_std": 0.27616971731185913, + "rewards/reward_func/mean": 0.09125000238418579, + "rewards/reward_func/std": 0.36876001954078674, + "sampling/importance_sampling_ratio/max": 1.618713617324829, + "sampling/importance_sampling_ratio/mean": 0.8206771612167358, + "sampling/importance_sampling_ratio/min": 0.19846399128437042, + "sampling/sampling_logp_difference/max": 1.0061447620391846, + "sampling/sampling_logp_difference/mean": 0.023636985570192337, + "step": 120, + "step_time": 36.12980112100195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.30235159397125244, + "epoch": 0.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8383305072784424, + "kl": 0.027072228491306305, + "learning_rate": 4.894543310469968e-06, + "loss": -0.0366, + "num_tokens": 676152.0, + "reward": 0.20125000178813934, + "reward_std": 0.5118378400802612, + "rewards/reward_func/mean": 0.20125000178813934, + "rewards/reward_func/std": 0.4741138517856598, + "sampling/importance_sampling_ratio/max": 1.6411762237548828, + "sampling/importance_sampling_ratio/mean": 0.9534869194030762, + "sampling/importance_sampling_ratio/min": 0.41311678290367126, + "sampling/sampling_logp_difference/max": 1.0102167129516602, + "sampling/sampling_logp_difference/mean": 0.02292507141828537, + "step": 121, + "step_time": 33.572335048011155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 51.75, + "completions/mean_terminated_length": 51.75, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3367387056350708, + "epoch": 0.244, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0167452096939087, + "kl": 0.06418883055448532, + "learning_rate": 4.8922038901559225e-06, + "loss": -0.0048, + "num_tokens": 682101.0, + "reward": 0.0650000050663948, + "reward_std": 0.28377196192741394, + "rewards/reward_func/mean": 0.0650000050663948, + "rewards/reward_func/std": 0.3791343569755554, + "sampling/importance_sampling_ratio/max": 1.28421950340271, + "sampling/importance_sampling_ratio/mean": 0.7725279331207275, + "sampling/importance_sampling_ratio/min": 0.1273626983165741, + "sampling/sampling_logp_difference/max": 1.543129801750183, + "sampling/sampling_logp_difference/mean": 0.02904380112886429, + "step": 122, + "step_time": 39.409893271003966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 46.625, + "completions/mean_terminated_length": 46.625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3044854998588562, + "epoch": 0.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8013125061988831, + "kl": 0.019057054072618484, + "learning_rate": 4.889839376755041e-06, + "loss": -0.1674, + "num_tokens": 688317.0, + "reward": 0.032499998807907104, + "reward_std": 0.2980650067329407, + "rewards/reward_func/mean": 0.032499998807907104, + "rewards/reward_func/std": 0.3873997628688812, + "sampling/importance_sampling_ratio/max": 1.4401540756225586, + "sampling/importance_sampling_ratio/mean": 1.0427517890930176, + "sampling/importance_sampling_ratio/min": 0.5601269006729126, + "sampling/sampling_logp_difference/max": 0.845482587814331, + "sampling/sampling_logp_difference/mean": 0.022582385689020157, + "step": 123, + "step_time": 47.42131155300012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.32158392667770386, + "epoch": 0.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1628981828689575, + "kl": 0.025724977254867554, + "learning_rate": 4.887449795069948e-06, + "loss": 0.0402, + "num_tokens": 694391.0, + "reward": 0.5987499952316284, + "reward_std": 0.2651512026786804, + "rewards/reward_func/mean": 0.5987499952316284, + "rewards/reward_func/std": 0.5322039723396301, + "sampling/importance_sampling_ratio/max": 2.152245283126831, + "sampling/importance_sampling_ratio/mean": 1.4744114875793457, + "sampling/importance_sampling_ratio/min": 0.2157006412744522, + "sampling/sampling_logp_difference/max": 1.0837020874023438, + "sampling/sampling_logp_difference/mean": 0.02428443171083927, + "step": 124, + "step_time": 41.57415235800727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3349279761314392, + "epoch": 0.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2164572477340698, + "kl": 0.15385711193084717, + "learning_rate": 4.885035170166229e-06, + "loss": -0.225, + "num_tokens": 699998.0, + "reward": 0.4737499952316284, + "reward_std": 0.5991882681846619, + "rewards/reward_func/mean": 0.4737499952316284, + "rewards/reward_func/std": 0.5547441244125366, + "sampling/importance_sampling_ratio/max": 1.5618447065353394, + "sampling/importance_sampling_ratio/mean": 0.9376882314682007, + "sampling/importance_sampling_ratio/min": 0.143229141831398, + "sampling/sampling_logp_difference/max": 2.045600414276123, + "sampling/sampling_logp_difference/mean": 0.02819056063890457, + "step": 125, + "step_time": 36.00430190899351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.33846554160118103, + "epoch": 0.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9615040421485901, + "kl": 0.028729338198900223, + "learning_rate": 4.8825955273721524e-06, + "loss": -0.0762, + "num_tokens": 705652.0, + "reward": 0.3187499940395355, + "reward_std": 0.5555436015129089, + "rewards/reward_func/mean": 0.3187499940395355, + "rewards/reward_func/std": 0.5321770906448364, + "sampling/importance_sampling_ratio/max": 1.5394591093063354, + "sampling/importance_sampling_ratio/mean": 0.9313254356384277, + "sampling/importance_sampling_ratio/min": 0.4095960855484009, + "sampling/sampling_logp_difference/max": 0.9179900884628296, + "sampling/sampling_logp_difference/mean": 0.027478884905576706, + "step": 126, + "step_time": 35.53052958998887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.317068874835968, + "epoch": 0.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.330291509628296, + "kl": 0.018047351390123367, + "learning_rate": 4.88013089227842e-06, + "loss": -0.0729, + "num_tokens": 710818.0, + "reward": 0.21125000715255737, + "reward_std": 0.31445786356925964, + "rewards/reward_func/mean": 0.21125000715255737, + "rewards/reward_func/std": 0.47555795311927795, + "sampling/importance_sampling_ratio/max": 1.5873545408248901, + "sampling/importance_sampling_ratio/mean": 1.018317461013794, + "sampling/importance_sampling_ratio/min": 0.41594427824020386, + "sampling/sampling_logp_difference/max": 1.0792710781097412, + "sampling/sampling_logp_difference/mean": 0.02153525874018669, + "step": 127, + "step_time": 36.711535330003244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3007584810256958, + "epoch": 0.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8899461030960083, + "kl": 0.050801776349544525, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0411, + "num_tokens": 716809.0, + "reward": -0.03750000149011612, + "reward_std": 0.028086936101317406, + "rewards/reward_func/mean": -0.03750000149011612, + "rewards/reward_func/std": 0.03011881187558174, + "sampling/importance_sampling_ratio/max": 1.389890193939209, + "sampling/importance_sampling_ratio/mean": 0.9472289085388184, + "sampling/importance_sampling_ratio/min": 0.48205605149269104, + "sampling/sampling_logp_difference/max": 0.8377180099487305, + "sampling/sampling_logp_difference/mean": 0.020493997260928154, + "step": 128, + "step_time": 39.203543747993535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.30576610565185547, + "epoch": 0.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9521754384040833, + "kl": 0.053522251546382904, + "learning_rate": 4.87512674886529e-06, + "loss": -0.2217, + "num_tokens": 721985.0, + "reward": 0.20000000298023224, + "reward_std": 0.3328213095664978, + "rewards/reward_func/mean": 0.20000000298023224, + "rewards/reward_func/std": 0.49856939911842346, + "sampling/importance_sampling_ratio/max": 2.0129761695861816, + "sampling/importance_sampling_ratio/mean": 0.9459276795387268, + "sampling/importance_sampling_ratio/min": 0.1618739664554596, + "sampling/sampling_logp_difference/max": 1.4156498908996582, + "sampling/sampling_logp_difference/mean": 0.02784860134124756, + "step": 129, + "step_time": 41.28106502900482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 51.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.33031588792800903, + "epoch": 0.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6887200474739075, + "kl": 0.016769763082265854, + "learning_rate": 4.872587293036991e-06, + "loss": 0.2953, + "num_tokens": 728153.0, + "reward": 0.20124998688697815, + "reward_std": 0.47694161534309387, + "rewards/reward_func/mean": 0.20124998688697815, + "rewards/reward_func/std": 0.441925585269928, + "sampling/importance_sampling_ratio/max": 1.7701690196990967, + "sampling/importance_sampling_ratio/mean": 0.8581265211105347, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9351005554199219, + "sampling/sampling_logp_difference/mean": 0.024315927177667618, + "step": 130, + "step_time": 38.647780115003115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.35881704092025757, + "epoch": 0.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5620861053466797, + "kl": 0.01443159207701683, + "learning_rate": 4.870022949890676e-06, + "loss": 0.0968, + "num_tokens": 734055.0, + "reward": 0.5875000357627869, + "reward_std": 0.5394585132598877, + "rewards/reward_func/mean": 0.5875000357627869, + "rewards/reward_func/std": 0.5231702923774719, + "sampling/importance_sampling_ratio/max": 2.211554765701294, + "sampling/importance_sampling_ratio/mean": 1.1617193222045898, + "sampling/importance_sampling_ratio/min": 0.5277535915374756, + "sampling/sampling_logp_difference/max": 0.6931544542312622, + "sampling/sampling_logp_difference/mean": 0.024210434406995773, + "step": 131, + "step_time": 36.056461569998646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.29458680748939514, + "epoch": 0.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.909350574016571, + "kl": 0.1462167501449585, + "learning_rate": 4.867433746325093e-06, + "loss": 0.1241, + "num_tokens": 740181.0, + "reward": -0.029999997466802597, + "reward_std": 0.039329893887043, + "rewards/reward_func/mean": -0.029999997466802597, + "rewards/reward_func/std": 0.03664501756429672, + "sampling/importance_sampling_ratio/max": 2.2302660942077637, + "sampling/importance_sampling_ratio/mean": 0.8643943667411804, + "sampling/importance_sampling_ratio/min": 0.17711393535137177, + "sampling/sampling_logp_difference/max": 1.4919137954711914, + "sampling/sampling_logp_difference/mean": 0.03085586428642273, + "step": 132, + "step_time": 41.68611562899605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 55.125, + "completions/mean_terminated_length": 55.125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.2915087342262268, + "epoch": 0.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1142926216125488, + "kl": 0.030653204768896103, + "learning_rate": 4.864819709499762e-06, + "loss": -0.2321, + "num_tokens": 745522.0, + "reward": 0.3174999952316284, + "reward_std": 0.586236298084259, + "rewards/reward_func/mean": 0.3174999952316284, + "rewards/reward_func/std": 0.562894344329834, + "sampling/importance_sampling_ratio/max": 1.7828896045684814, + "sampling/importance_sampling_ratio/mean": 1.0796490907669067, + "sampling/importance_sampling_ratio/min": 0.3094936013221741, + "sampling/sampling_logp_difference/max": 0.877861499786377, + "sampling/sampling_logp_difference/mean": 0.0238037146627903, + "step": 133, + "step_time": 34.40468131400121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.35022756457328796, + "epoch": 0.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6583671569824219, + "kl": 0.21844248473644257, + "learning_rate": 4.862180866834691e-06, + "loss": -0.0494, + "num_tokens": 751260.0, + "reward": 0.4650000035762787, + "reward_std": 0.6151957511901855, + "rewards/reward_func/mean": 0.4650000035762787, + "rewards/reward_func/std": 0.5702630877494812, + "sampling/importance_sampling_ratio/max": 2.757086992263794, + "sampling/importance_sampling_ratio/mean": 1.3203340768814087, + "sampling/importance_sampling_ratio/min": 0.6060196757316589, + "sampling/sampling_logp_difference/max": 0.6190414428710938, + "sampling/sampling_logp_difference/mean": 0.025964640080928802, + "step": 134, + "step_time": 29.936322672001552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3812062740325928, + "epoch": 0.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5890302658081055, + "kl": 0.014861365780234337, + "learning_rate": 4.8595172460100914e-06, + "loss": 0.0607, + "num_tokens": 756304.0, + "reward": 0.03875000402331352, + "reward_std": 0.271657258272171, + "rewards/reward_func/mean": 0.03875000402331352, + "rewards/reward_func/std": 0.35534441471099854, + "sampling/importance_sampling_ratio/max": 2.5916507244110107, + "sampling/importance_sampling_ratio/mean": 0.8855729103088379, + "sampling/importance_sampling_ratio/min": 0.21787041425704956, + "sampling/sampling_logp_difference/max": 0.9328546524047852, + "sampling/sampling_logp_difference/mean": 0.03202846646308899, + "step": 135, + "step_time": 39.908553318004124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3220250606536865, + "epoch": 0.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4078805446624756, + "kl": 0.023584317415952682, + "learning_rate": 4.856828874966086e-06, + "loss": -0.1456, + "num_tokens": 762477.0, + "reward": 0.48750001192092896, + "reward_std": 0.5153918266296387, + "rewards/reward_func/mean": 0.48750001192092896, + "rewards/reward_func/std": 0.5482374429702759, + "sampling/importance_sampling_ratio/max": 2.551933765411377, + "sampling/importance_sampling_ratio/mean": 1.3002712726593018, + "sampling/importance_sampling_ratio/min": 0.884465754032135, + "sampling/sampling_logp_difference/max": 0.5747478008270264, + "sampling/sampling_logp_difference/mean": 0.022638380527496338, + "step": 136, + "step_time": 32.25789829700079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 48.125, + "completions/mean_terminated_length": 48.125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3236207962036133, + "epoch": 0.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9340776205062866, + "kl": 0.03285123407840729, + "learning_rate": 4.854115781902414e-06, + "loss": 0.0131, + "num_tokens": 768391.0, + "reward": 0.21125000715255737, + "reward_std": 0.5204670429229736, + "rewards/reward_func/mean": 0.21125000715255737, + "rewards/reward_func/std": 0.4824472665786743, + "sampling/importance_sampling_ratio/max": 1.7785167694091797, + "sampling/importance_sampling_ratio/mean": 0.8016531467437744, + "sampling/importance_sampling_ratio/min": 0.2458231896162033, + "sampling/sampling_logp_difference/max": 1.4985270500183105, + "sampling/sampling_logp_difference/mean": 0.028025494888424873, + "step": 137, + "step_time": 34.520976280007744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 51.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3042897582054138, + "epoch": 0.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.278320550918579, + "kl": 0.015772713348269463, + "learning_rate": 4.851377995278138e-06, + "loss": 0.2867, + "num_tokens": 773957.0, + "reward": 0.19749999046325684, + "reward_std": 0.31783488392829895, + "rewards/reward_func/mean": 0.19749999046325684, + "rewards/reward_func/std": 0.48643749952316284, + "sampling/importance_sampling_ratio/max": 2.4400241374969482, + "sampling/importance_sampling_ratio/mean": 1.1646664142608643, + "sampling/importance_sampling_ratio/min": 0.5930517911911011, + "sampling/sampling_logp_difference/max": 0.5375218391418457, + "sampling/sampling_logp_difference/mean": 0.023220781236886978, + "step": 138, + "step_time": 37.38816410599975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.35318320989608765, + "epoch": 0.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0766141414642334, + "kl": 0.018960729241371155, + "learning_rate": 4.8486155438113455e-06, + "loss": -0.2235, + "num_tokens": 779691.0, + "reward": 0.2175000011920929, + "reward_std": 0.321079283952713, + "rewards/reward_func/mean": 0.2175000011920929, + "rewards/reward_func/std": 0.48352134227752686, + "sampling/importance_sampling_ratio/max": 1.471369981765747, + "sampling/importance_sampling_ratio/mean": 1.112367868423462, + "sampling/importance_sampling_ratio/min": 0.4520459473133087, + "sampling/sampling_logp_difference/max": 0.5129961967468262, + "sampling/sampling_logp_difference/mean": 0.02581869438290596, + "step": 139, + "step_time": 36.47941235799226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.35746175050735474, + "epoch": 0.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5763007402420044, + "kl": 0.049079518765211105, + "learning_rate": 4.845828456478843e-06, + "loss": 0.1571, + "num_tokens": 785207.0, + "reward": 0.32500001788139343, + "reward_std": 0.2978859543800354, + "rewards/reward_func/mean": 0.32500001788139343, + "rewards/reward_func/std": 0.5395500659942627, + "sampling/importance_sampling_ratio/max": 1.674521803855896, + "sampling/importance_sampling_ratio/mean": 0.9480903148651123, + "sampling/importance_sampling_ratio/min": 0.11278916150331497, + "sampling/sampling_logp_difference/max": 2.2472801208496094, + "sampling/sampling_logp_difference/mean": 0.03470786660909653, + "step": 140, + "step_time": 40.7823389940022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.32371532917022705, + "epoch": 0.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0982533693313599, + "kl": 0.01844196580350399, + "learning_rate": 4.84301676251586e-06, + "loss": 0.1592, + "num_tokens": 790173.0, + "reward": 0.20874999463558197, + "reward_std": 0.30841800570487976, + "rewards/reward_func/mean": 0.20874999463558197, + "rewards/reward_func/std": 0.45920541882514954, + "sampling/importance_sampling_ratio/max": 1.9294414520263672, + "sampling/importance_sampling_ratio/mean": 0.9358152747154236, + "sampling/importance_sampling_ratio/min": 0.3433741331100464, + "sampling/sampling_logp_difference/max": 1.0784287452697754, + "sampling/sampling_logp_difference/mean": 0.024096664041280746, + "step": 141, + "step_time": 38.74317984600202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.34515082836151123, + "epoch": 0.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8294340372085571, + "kl": 0.030305206775665283, + "learning_rate": 4.840180491415733e-06, + "loss": -0.1339, + "num_tokens": 795229.0, + "reward": 0.3450000286102295, + "reward_std": 0.5490354299545288, + "rewards/reward_func/mean": 0.3450000286102295, + "rewards/reward_func/std": 0.5287992358207703, + "sampling/importance_sampling_ratio/max": 2.045395612716675, + "sampling/importance_sampling_ratio/mean": 1.216064453125, + "sampling/importance_sampling_ratio/min": 0.5218259692192078, + "sampling/sampling_logp_difference/max": 1.2418723106384277, + "sampling/sampling_logp_difference/mean": 0.03373716026544571, + "step": 142, + "step_time": 31.14222859099391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 54.875, + "completions/mean_terminated_length": 54.875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.38218700885772705, + "epoch": 0.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0369791984558105, + "kl": 0.047582805156707764, + "learning_rate": 4.837319672929606e-06, + "loss": 0.6064, + "num_tokens": 801778.0, + "reward": -0.07625000178813934, + "reward_std": 0.03596387431025505, + "rewards/reward_func/mean": -0.07625000178813934, + "rewards/reward_func/std": 0.04240535944700241, + "sampling/importance_sampling_ratio/max": 2.7870662212371826, + "sampling/importance_sampling_ratio/mean": 1.0109559297561646, + "sampling/importance_sampling_ratio/min": 0.2245939075946808, + "sampling/sampling_logp_difference/max": 0.9906719326972961, + "sampling/sampling_logp_difference/mean": 0.031783588230609894, + "step": 143, + "step_time": 47.368925528993714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 53.125, + "completions/mean_terminated_length": 53.125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3638345003128052, + "epoch": 0.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9337130188941956, + "kl": 0.024956434965133667, + "learning_rate": 4.834434337066112e-06, + "loss": -0.0504, + "num_tokens": 808054.0, + "reward": 0.09000000357627869, + "reward_std": 0.2676810622215271, + "rewards/reward_func/mean": 0.09000000357627869, + "rewards/reward_func/std": 0.3601190149784088, + "sampling/importance_sampling_ratio/max": 2.723515510559082, + "sampling/importance_sampling_ratio/mean": 1.0728464126586914, + "sampling/importance_sampling_ratio/min": 0.6166611909866333, + "sampling/sampling_logp_difference/max": 0.7025502920150757, + "sampling/sampling_logp_difference/mean": 0.026006463915109634, + "step": 144, + "step_time": 40.9219185700058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3282458782196045, + "epoch": 0.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6762001514434814, + "kl": 0.031979549676179886, + "learning_rate": 4.831524514091056e-06, + "loss": -0.1265, + "num_tokens": 813504.0, + "reward": 0.0624999962747097, + "reward_std": 0.2866820991039276, + "rewards/reward_func/mean": 0.0624999962747097, + "rewards/reward_func/std": 0.36846205592155457, + "sampling/importance_sampling_ratio/max": 1.8622187376022339, + "sampling/importance_sampling_ratio/mean": 1.1275174617767334, + "sampling/importance_sampling_ratio/min": 0.571599543094635, + "sampling/sampling_logp_difference/max": 1.3647043704986572, + "sampling/sampling_logp_difference/mean": 0.02731524407863617, + "step": 145, + "step_time": 38.23023704699881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3222322463989258, + "epoch": 0.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9344398379325867, + "kl": 0.0195845365524292, + "learning_rate": 4.828590234527107e-06, + "loss": 0.0898, + "num_tokens": 818808.0, + "reward": 0.5824999809265137, + "reward_std": 0.5538103580474854, + "rewards/reward_func/mean": 0.5824999809265137, + "rewards/reward_func/std": 0.5363035202026367, + "sampling/importance_sampling_ratio/max": 1.1425395011901855, + "sampling/importance_sampling_ratio/mean": 0.6538075804710388, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.8657850027084351, + "sampling/sampling_logp_difference/mean": 0.02582240290939808, + "step": 146, + "step_time": 30.217266699997708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.31768035888671875, + "epoch": 0.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3994672298431396, + "kl": 0.02365160547196865, + "learning_rate": 4.825631529153466e-06, + "loss": -0.0764, + "num_tokens": 824306.0, + "reward": 0.35750001668930054, + "reward_std": 0.2609923481941223, + "rewards/reward_func/mean": 0.35750001668930054, + "rewards/reward_func/std": 0.5296292901039124, + "sampling/importance_sampling_ratio/max": 1.9504226446151733, + "sampling/importance_sampling_ratio/mean": 1.1939644813537598, + "sampling/importance_sampling_ratio/min": 0.5214744210243225, + "sampling/sampling_logp_difference/max": 0.8518610000610352, + "sampling/sampling_logp_difference/mean": 0.024599412456154823, + "step": 147, + "step_time": 38.60695266799303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.31512904167175293, + "epoch": 0.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0805143117904663, + "kl": 0.021506479009985924, + "learning_rate": 4.8226484290055544e-06, + "loss": 0.2375, + "num_tokens": 830040.0, + "reward": 0.19875000417232513, + "reward_std": 0.5263077020645142, + "rewards/reward_func/mean": 0.19875000417232513, + "rewards/reward_func/std": 0.4873672127723694, + "sampling/importance_sampling_ratio/max": 2.6040730476379395, + "sampling/importance_sampling_ratio/mean": 1.1463204622268677, + "sampling/importance_sampling_ratio/min": 0.38138559460639954, + "sampling/sampling_logp_difference/max": 0.7039591073989868, + "sampling/sampling_logp_difference/mean": 0.025920793414115906, + "step": 148, + "step_time": 34.074096187992836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3225823640823364, + "epoch": 0.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.075644850730896, + "kl": 0.01727544516324997, + "learning_rate": 4.8196409653746815e-06, + "loss": -0.068, + "num_tokens": 835585.0, + "reward": 0.0637499988079071, + "reward_std": 0.251331090927124, + "rewards/reward_func/mean": 0.0637499988079071, + "rewards/reward_func/std": 0.33940860629081726, + "sampling/importance_sampling_ratio/max": 2.514382839202881, + "sampling/importance_sampling_ratio/mean": 1.2466282844543457, + "sampling/importance_sampling_ratio/min": 0.49027585983276367, + "sampling/sampling_logp_difference/max": 0.5779092311859131, + "sampling/sampling_logp_difference/mean": 0.020606372505426407, + "step": 149, + "step_time": 40.79852026500157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 55.125, + "completions/mean_terminated_length": 55.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.34883180260658264, + "epoch": 0.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.851610541343689, + "kl": 0.021933497861027718, + "learning_rate": 4.8166091698077165e-06, + "loss": 0.0618, + "num_tokens": 840529.0, + "reward": 0.20374999940395355, + "reward_std": 0.32920289039611816, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.4870006740093231, + "sampling/importance_sampling_ratio/max": 1.754447102546692, + "sampling/importance_sampling_ratio/mean": 0.8660247921943665, + "sampling/importance_sampling_ratio/min": 0.4203989803791046, + "sampling/sampling_logp_difference/max": 0.9182287454605103, + "sampling/sampling_logp_difference/mean": 0.024133939296007156, + "step": 150, + "step_time": 35.18480104600894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 43.25, + "completions/mean_terminated_length": 43.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.28381434082984924, + "epoch": 0.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7717716693878174, + "kl": 0.01022813469171524, + "learning_rate": 4.813553074106761e-06, + "loss": 0.1029, + "num_tokens": 845485.0, + "reward": 0.20374999940395355, + "reward_std": 0.29958459734916687, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.4509335458278656, + "sampling/importance_sampling_ratio/max": 1.0267828702926636, + "sampling/importance_sampling_ratio/mean": 0.7419992685317993, + "sampling/importance_sampling_ratio/min": 0.43720394372940063, + "sampling/sampling_logp_difference/max": 0.7798802852630615, + "sampling/sampling_logp_difference/mean": 0.024541743099689484, + "step": 151, + "step_time": 34.449168086997815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 51.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3197404742240906, + "epoch": 0.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2963764667510986, + "kl": 0.021068178117275238, + "learning_rate": 4.8104727103288125e-06, + "loss": -0.0558, + "num_tokens": 851304.0, + "reward": 0.22374999523162842, + "reward_std": 0.5079156160354614, + "rewards/reward_func/mean": 0.22374999523162842, + "rewards/reward_func/std": 0.47025638818740845, + "sampling/importance_sampling_ratio/max": 1.4703145027160645, + "sampling/importance_sampling_ratio/mean": 1.0212607383728027, + "sampling/importance_sampling_ratio/min": 0.21392318606376648, + "sampling/sampling_logp_difference/max": 1.0519495010375977, + "sampling/sampling_logp_difference/mean": 0.025931382551789284, + "step": 152, + "step_time": 36.7780589239992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 57.75, + "completions/mean_terminated_length": 57.75, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 0.3580837845802307, + "epoch": 0.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.017104148864746, + "kl": 0.00937014352530241, + "learning_rate": 4.80736811078543e-06, + "loss": 0.0233, + "num_tokens": 856882.0, + "reward": 0.39499998092651367, + "reward_std": 0.6293153166770935, + "rewards/reward_func/mean": 0.39499998092651367, + "rewards/reward_func/std": 0.5829482078552246, + "sampling/importance_sampling_ratio/max": 1.3782010078430176, + "sampling/importance_sampling_ratio/mean": 1.1638000011444092, + "sampling/importance_sampling_ratio/min": 0.8089496493339539, + "sampling/sampling_logp_difference/max": 0.35408473014831543, + "sampling/sampling_logp_difference/mean": 0.018965404480695724, + "step": 153, + "step_time": 36.323780152990366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3402557373046875, + "epoch": 0.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0118770599365234, + "kl": 0.02948238141834736, + "learning_rate": 4.804239308042392e-06, + "loss": 0.1774, + "num_tokens": 862416.0, + "reward": 0.46875, + "reward_std": 0.5176174640655518, + "rewards/reward_func/mean": 0.46875, + "rewards/reward_func/std": 0.560215175151825, + "sampling/importance_sampling_ratio/max": 1.8436402082443237, + "sampling/importance_sampling_ratio/mean": 1.0758095979690552, + "sampling/importance_sampling_ratio/min": 0.30199378728866577, + "sampling/sampling_logp_difference/max": 0.8697078227996826, + "sampling/sampling_logp_difference/mean": 0.029135018587112427, + "step": 154, + "step_time": 34.09276152199891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 53.875, + "completions/mean_terminated_length": 53.875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.363218754529953, + "epoch": 0.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3487781286239624, + "kl": 0.022380828857421875, + "learning_rate": 4.8010863349193605e-06, + "loss": -0.1792, + "num_tokens": 867974.0, + "reward": 0.2150000035762787, + "reward_std": 0.30277496576309204, + "rewards/reward_func/mean": 0.2150000035762787, + "rewards/reward_func/std": 0.4673328697681427, + "sampling/importance_sampling_ratio/max": 2.5031354427337646, + "sampling/importance_sampling_ratio/mean": 1.1692034006118774, + "sampling/importance_sampling_ratio/min": 0.5184096097946167, + "sampling/sampling_logp_difference/max": 0.4144449830055237, + "sampling/sampling_logp_difference/mean": 0.02268042229115963, + "step": 155, + "step_time": 34.44658254900423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.34199315309524536, + "epoch": 0.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1296980381011963, + "kl": 0.03636087477207184, + "learning_rate": 4.797909224489531e-06, + "loss": 0.29, + "num_tokens": 873668.0, + "reward": 0.3400000035762787, + "reward_std": 0.55422043800354, + "rewards/reward_func/mean": 0.3400000035762787, + "rewards/reward_func/std": 0.5347362756729126, + "sampling/importance_sampling_ratio/max": 1.8167059421539307, + "sampling/importance_sampling_ratio/mean": 1.0690479278564453, + "sampling/importance_sampling_ratio/min": 0.3096846044063568, + "sampling/sampling_logp_difference/max": 1.1487369537353516, + "sampling/sampling_logp_difference/mean": 0.02510654367506504, + "step": 156, + "step_time": 43.02519494399894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.33920204639434814, + "epoch": 0.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9249754548072815, + "kl": 0.01282467320561409, + "learning_rate": 4.794708010079288e-06, + "loss": 0.1239, + "num_tokens": 879587.0, + "reward": 0.19749999046325684, + "reward_std": 0.5256129503250122, + "rewards/reward_func/mean": 0.19749999046325684, + "rewards/reward_func/std": 0.4881379008293152, + "sampling/importance_sampling_ratio/max": 2.3966870307922363, + "sampling/importance_sampling_ratio/mean": 1.0402398109436035, + "sampling/importance_sampling_ratio/min": 0.5133960843086243, + "sampling/sampling_logp_difference/max": 0.4196079969406128, + "sampling/sampling_logp_difference/mean": 0.023836011067032814, + "step": 157, + "step_time": 37.06165342399618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.35729801654815674, + "epoch": 0.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6500968933105469, + "kl": 0.020701207220554352, + "learning_rate": 4.791482725267858e-06, + "loss": -0.3411, + "num_tokens": 884816.0, + "reward": 0.05625000596046448, + "reward_std": 0.299073725938797, + "rewards/reward_func/mean": 0.05625000596046448, + "rewards/reward_func/std": 0.38022318482398987, + "sampling/importance_sampling_ratio/max": 2.0143370628356934, + "sampling/importance_sampling_ratio/mean": 0.829423189163208, + "sampling/importance_sampling_ratio/min": 0.4333471953868866, + "sampling/sampling_logp_difference/max": 0.9994931221008301, + "sampling/sampling_logp_difference/mean": 0.03577464818954468, + "step": 158, + "step_time": 36.08289574600349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.39358967542648315, + "epoch": 0.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0583946704864502, + "kl": 0.027374830096960068, + "learning_rate": 4.78823340388695e-06, + "loss": -0.2827, + "num_tokens": 891153.0, + "reward": 0.20124998688697815, + "reward_std": 0.3190965950489044, + "rewards/reward_func/mean": 0.20124998688697815, + "rewards/reward_func/std": 0.46035507321357727, + "sampling/importance_sampling_ratio/max": 2.10949444770813, + "sampling/importance_sampling_ratio/mean": 1.0305712223052979, + "sampling/importance_sampling_ratio/min": 0.3240506649017334, + "sampling/sampling_logp_difference/max": 0.5563297271728516, + "sampling/sampling_logp_difference/mean": 0.025423740968108177, + "step": 159, + "step_time": 38.957578446003026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 45.75, + "completions/mean_terminated_length": 45.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3258906602859497, + "epoch": 0.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1462774276733398, + "kl": 0.03443189710378647, + "learning_rate": 4.7849600800204075e-06, + "loss": -0.1244, + "num_tokens": 896743.0, + "reward": 0.32124999165534973, + "reward_std": 0.30975341796875, + "rewards/reward_func/mean": 0.32124999165534973, + "rewards/reward_func/std": 0.5477600693702698, + "sampling/importance_sampling_ratio/max": 1.8812116384506226, + "sampling/importance_sampling_ratio/mean": 1.0209946632385254, + "sampling/importance_sampling_ratio/min": 0.4928432106971741, + "sampling/sampling_logp_difference/max": 1.028313159942627, + "sampling/sampling_logp_difference/mean": 0.025369469076395035, + "step": 160, + "step_time": 40.763587570007076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.2874404788017273, + "epoch": 0.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8741604089736938, + "kl": 0.036176808178424835, + "learning_rate": 4.781662788003851e-06, + "loss": 0.0124, + "num_tokens": 901634.0, + "reward": 0.33375000953674316, + "reward_std": 0.5386093854904175, + "rewards/reward_func/mean": 0.33375000953674316, + "rewards/reward_func/std": 0.5257087349891663, + "sampling/importance_sampling_ratio/max": 1.1090807914733887, + "sampling/importance_sampling_ratio/mean": 0.8030825853347778, + "sampling/importance_sampling_ratio/min": 0.5435417294502258, + "sampling/sampling_logp_difference/max": 0.7872741222381592, + "sampling/sampling_logp_difference/mean": 0.02408299222588539, + "step": 161, + "step_time": 36.742873462993884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 41.875, + "completions/mean_terminated_length": 41.875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3158836364746094, + "epoch": 0.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0320552587509155, + "kl": 0.028184669092297554, + "learning_rate": 4.778341562424312e-06, + "loss": 0.0158, + "num_tokens": 906954.0, + "reward": 0.07124999910593033, + "reward_std": 0.2818355858325958, + "rewards/reward_func/mean": 0.07124999910593033, + "rewards/reward_func/std": 0.3733798563480377, + "sampling/importance_sampling_ratio/max": 1.5998609066009521, + "sampling/importance_sampling_ratio/mean": 0.9003580212593079, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9920358657836914, + "sampling/sampling_logp_difference/mean": 0.026213299483060837, + "step": 162, + "step_time": 36.50039826599823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.32623767852783203, + "epoch": 0.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6853413581848145, + "kl": 0.04150290787220001, + "learning_rate": 4.774996438119876e-06, + "loss": 0.0513, + "num_tokens": 912348.0, + "reward": 0.05125000327825546, + "reward_std": 0.2890905439853668, + "rewards/reward_func/mean": 0.05125000327825546, + "rewards/reward_func/std": 0.380429744720459, + "sampling/importance_sampling_ratio/max": 1.632591962814331, + "sampling/importance_sampling_ratio/mean": 0.929560661315918, + "sampling/importance_sampling_ratio/min": 0.5039329528808594, + "sampling/sampling_logp_difference/max": 1.2554044723510742, + "sampling/sampling_logp_difference/mean": 0.02818283624947071, + "step": 163, + "step_time": 37.48346809898794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.38629114627838135, + "epoch": 0.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8776299357414246, + "kl": 0.01975177228450775, + "learning_rate": 4.771627450179315e-06, + "loss": -0.0369, + "num_tokens": 918373.0, + "reward": 0.0925000011920929, + "reward_std": 0.2773401141166687, + "rewards/reward_func/mean": 0.0925000011920929, + "rewards/reward_func/std": 0.36768582463264465, + "sampling/importance_sampling_ratio/max": 1.5313901901245117, + "sampling/importance_sampling_ratio/mean": 0.7385870218276978, + "sampling/importance_sampling_ratio/min": 0.15066924691200256, + "sampling/sampling_logp_difference/max": 1.0661274194717407, + "sampling/sampling_logp_difference/mean": 0.02726839855313301, + "step": 164, + "step_time": 41.31438583700219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.34316372871398926, + "epoch": 0.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3050645589828491, + "kl": 0.04379383847117424, + "learning_rate": 4.768234633941716e-06, + "loss": -0.2008, + "num_tokens": 924728.0, + "reward": 0.4787500202655792, + "reward_std": 0.033062152564525604, + "rewards/reward_func/mean": 0.4787500202655792, + "rewards/reward_func/std": 0.5450671911239624, + "sampling/importance_sampling_ratio/max": 1.647208333015442, + "sampling/importance_sampling_ratio/mean": 0.962598979473114, + "sampling/importance_sampling_ratio/min": 0.3506295382976532, + "sampling/sampling_logp_difference/max": 1.1214780807495117, + "sampling/sampling_logp_difference/mean": 0.024030229076743126, + "step": 165, + "step_time": 37.69243161600025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 46.875, + "completions/mean_terminated_length": 46.875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.32444441318511963, + "epoch": 0.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2222055196762085, + "kl": 0.01766137219965458, + "learning_rate": 4.764818024996117e-06, + "loss": -0.1451, + "num_tokens": 930789.0, + "reward": 0.20125000178813934, + "reward_std": 0.30519673228263855, + "rewards/reward_func/mean": 0.20125000178813934, + "rewards/reward_func/std": 0.47792521119117737, + "sampling/importance_sampling_ratio/max": 1.7913861274719238, + "sampling/importance_sampling_ratio/mean": 1.0549870729446411, + "sampling/importance_sampling_ratio/min": 0.6209827661514282, + "sampling/sampling_logp_difference/max": 0.5931665897369385, + "sampling/sampling_logp_difference/mean": 0.0224621519446373, + "step": 166, + "step_time": 44.832081535001635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 56.0, + "completions/mean_terminated_length": 56.0, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3322504758834839, + "epoch": 0.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0410302877426147, + "kl": 0.0115932896733284, + "learning_rate": 4.76137765918113e-06, + "loss": 0.1055, + "num_tokens": 935988.0, + "reward": -0.04249999672174454, + "reward_std": 0.042813271284103394, + "rewards/reward_func/mean": -0.04249999672174454, + "rewards/reward_func/std": 0.040620192885398865, + "sampling/importance_sampling_ratio/max": 1.8506327867507935, + "sampling/importance_sampling_ratio/mean": 0.9713721871376038, + "sampling/importance_sampling_ratio/min": 0.3454587757587433, + "sampling/sampling_logp_difference/max": 0.5269238948822021, + "sampling/sampling_logp_difference/mean": 0.023420769721269608, + "step": 167, + "step_time": 37.73256101299194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3636493682861328, + "epoch": 0.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0732991695404053, + "kl": 0.02313004620373249, + "learning_rate": 4.757913572584564e-06, + "loss": 0.11, + "num_tokens": 941321.0, + "reward": 0.5987499952316284, + "reward_std": 0.549216091632843, + "rewards/reward_func/mean": 0.5987499952316284, + "rewards/reward_func/std": 0.5237621665000916, + "sampling/importance_sampling_ratio/max": 1.03546142578125, + "sampling/importance_sampling_ratio/mean": 0.8432279825210571, + "sampling/importance_sampling_ratio/min": 0.4068431258201599, + "sampling/sampling_logp_difference/max": 0.6616531014442444, + "sampling/sampling_logp_difference/mean": 0.024472419172525406, + "step": 168, + "step_time": 32.777487155995914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.35398632287979126, + "epoch": 0.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9844120144844055, + "kl": 0.021729620173573494, + "learning_rate": 4.754425801543047e-06, + "loss": 0.0676, + "num_tokens": 947289.0, + "reward": 0.08124999701976776, + "reward_std": 0.28951090574264526, + "rewards/reward_func/mean": 0.08124999701976776, + "rewards/reward_func/std": 0.37745150923728943, + "sampling/importance_sampling_ratio/max": 1.1652858257293701, + "sampling/importance_sampling_ratio/mean": 0.9127243757247925, + "sampling/importance_sampling_ratio/min": 0.5623902678489685, + "sampling/sampling_logp_difference/max": 0.7757892608642578, + "sampling/sampling_logp_difference/mean": 0.023509806022047997, + "step": 169, + "step_time": 49.02694328399957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3460179567337036, + "epoch": 0.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3282604217529297, + "kl": 0.020951174199581146, + "learning_rate": 4.750914382641647e-06, + "loss": -0.2501, + "num_tokens": 952689.0, + "reward": 0.4937500059604645, + "reward_std": 0.5788298845291138, + "rewards/reward_func/mean": 0.4937500059604645, + "rewards/reward_func/std": 0.5359088182449341, + "sampling/importance_sampling_ratio/max": 2.5567233562469482, + "sampling/importance_sampling_ratio/mean": 1.0252596139907837, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.6747963428497314, + "sampling/sampling_logp_difference/mean": 0.031033912673592567, + "step": 170, + "step_time": 30.625086041996838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 48.125, + "completions/mean_terminated_length": 48.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3202894926071167, + "epoch": 0.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.965639114379883, + "kl": 0.02088959515094757, + "learning_rate": 4.747379352713489e-06, + "loss": -0.0871, + "num_tokens": 958452.0, + "reward": 0.08999999612569809, + "reward_std": 0.2740509510040283, + "rewards/reward_func/mean": 0.08999999612569809, + "rewards/reward_func/std": 0.36851051449775696, + "sampling/importance_sampling_ratio/max": 2.276595115661621, + "sampling/importance_sampling_ratio/mean": 1.3557054996490479, + "sampling/importance_sampling_ratio/min": 0.7857190370559692, + "sampling/sampling_logp_difference/max": 0.6380100250244141, + "sampling/sampling_logp_difference/mean": 0.02812064066529274, + "step": 171, + "step_time": 45.271941720013274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 50.375, + "completions/mean_terminated_length": 50.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.36473971605300903, + "epoch": 0.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.925652027130127, + "kl": 0.057864509522914886, + "learning_rate": 4.743820748839362e-06, + "loss": 0.086, + "num_tokens": 963908.0, + "reward": 0.34375, + "reward_std": 0.2589721083641052, + "rewards/reward_func/mean": 0.34375, + "rewards/reward_func/std": 0.5190633535385132, + "sampling/importance_sampling_ratio/max": 2.0226168632507324, + "sampling/importance_sampling_ratio/mean": 0.9894812703132629, + "sampling/importance_sampling_ratio/min": 0.14174574613571167, + "sampling/sampling_logp_difference/max": 1.6586623191833496, + "sampling/sampling_logp_difference/mean": 0.02724120020866394, + "step": 172, + "step_time": 35.92826945899287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 46.0, + "completions/mean_terminated_length": 46.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.32610613107681274, + "epoch": 0.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7379596829414368, + "kl": 0.04068184643983841, + "learning_rate": 4.740238608347337e-06, + "loss": 0.0258, + "num_tokens": 969638.0, + "reward": 0.08125000447034836, + "reward_std": 0.28094959259033203, + "rewards/reward_func/mean": 0.08125000447034836, + "rewards/reward_func/std": 0.37357112765312195, + "sampling/importance_sampling_ratio/max": 1.0961157083511353, + "sampling/importance_sampling_ratio/mean": 0.7261965274810791, + "sampling/importance_sampling_ratio/min": 0.32102635502815247, + "sampling/sampling_logp_difference/max": 1.0670392513275146, + "sampling/sampling_logp_difference/mean": 0.026620227843523026, + "step": 173, + "step_time": 37.101911647987436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.310922771692276, + "epoch": 0.348, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.942238450050354, + "kl": 0.011856799945235252, + "learning_rate": 4.736632968812374e-06, + "loss": -0.0206, + "num_tokens": 974836.0, + "reward": 0.4750000238418579, + "reward_std": 0.6064904928207397, + "rewards/reward_func/mean": 0.4750000238418579, + "rewards/reward_func/std": 0.561553955078125, + "sampling/importance_sampling_ratio/max": 1.7288455963134766, + "sampling/importance_sampling_ratio/mean": 0.8856727480888367, + "sampling/importance_sampling_ratio/min": 0.38050374388694763, + "sampling/sampling_logp_difference/max": 1.0040206909179688, + "sampling/sampling_logp_difference/mean": 0.021355075761675835, + "step": 174, + "step_time": 28.776191511002253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.39204347133636475, + "epoch": 0.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9643235206604004, + "kl": 0.020943326875567436, + "learning_rate": 4.733003868055923e-06, + "loss": -0.1772, + "num_tokens": 980946.0, + "reward": 0.19249999523162842, + "reward_std": 0.519625186920166, + "rewards/reward_func/mean": 0.19249999523162842, + "rewards/reward_func/std": 0.4812706410884857, + "sampling/importance_sampling_ratio/max": 1.807846188545227, + "sampling/importance_sampling_ratio/mean": 1.206834077835083, + "sampling/importance_sampling_ratio/min": 0.4919049143791199, + "sampling/sampling_logp_difference/max": 0.673508882522583, + "sampling/sampling_logp_difference/mean": 0.027322232723236084, + "step": 175, + "step_time": 44.86855116499646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3516603112220764, + "epoch": 0.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0344626903533936, + "kl": 0.013177501037716866, + "learning_rate": 4.729351344145536e-06, + "loss": -0.0118, + "num_tokens": 986362.0, + "reward": 0.23125000298023224, + "reward_std": 0.30514803528785706, + "rewards/reward_func/mean": 0.23125000298023224, + "rewards/reward_func/std": 0.46887215971946716, + "sampling/importance_sampling_ratio/max": 1.2862433195114136, + "sampling/importance_sampling_ratio/mean": 0.8300943374633789, + "sampling/importance_sampling_ratio/min": 0.2423274964094162, + "sampling/sampling_logp_difference/max": 0.8146283626556396, + "sampling/sampling_logp_difference/mean": 0.028766807168722153, + "step": 176, + "step_time": 33.98946770199109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.30126285552978516, + "epoch": 0.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9718162417411804, + "kl": 0.047380391508340836, + "learning_rate": 4.725675435394461e-06, + "loss": -0.0925, + "num_tokens": 991859.0, + "reward": 0.3362500071525574, + "reward_std": 0.570709228515625, + "rewards/reward_func/mean": 0.3362500071525574, + "rewards/reward_func/std": 0.5503749251365662, + "sampling/importance_sampling_ratio/max": 1.8823354244232178, + "sampling/importance_sampling_ratio/mean": 0.8540709018707275, + "sampling/importance_sampling_ratio/min": 0.27412620186805725, + "sampling/sampling_logp_difference/max": 1.3906545639038086, + "sampling/sampling_logp_difference/mean": 0.02667493373155594, + "step": 177, + "step_time": 32.61802449199604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.375, + "completions/mean_terminated_length": 49.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.30542075634002686, + "epoch": 0.356, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640173196792603, + "kl": 0.08134660124778748, + "learning_rate": 4.721976180361239e-06, + "loss": -0.0737, + "num_tokens": 996941.0, + "reward": 0.33249998092651367, + "reward_std": 0.547777533531189, + "rewards/reward_func/mean": 0.33249998092651367, + "rewards/reward_func/std": 0.5260295867919922, + "sampling/importance_sampling_ratio/max": 2.168469190597534, + "sampling/importance_sampling_ratio/mean": 0.9947339296340942, + "sampling/importance_sampling_ratio/min": 0.20307093858718872, + "sampling/sampling_logp_difference/max": 1.36405611038208, + "sampling/sampling_logp_difference/mean": 0.026516852900385857, + "step": 178, + "step_time": 34.52733155799797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.31798258423805237, + "epoch": 0.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9152070879936218, + "kl": 0.06630422919988632, + "learning_rate": 4.718253617849306e-06, + "loss": 0.0036, + "num_tokens": 1002934.0, + "reward": 0.20499999821186066, + "reward_std": 0.5118623971939087, + "rewards/reward_func/mean": 0.20499999821186066, + "rewards/reward_func/std": 0.47473302483558655, + "sampling/importance_sampling_ratio/max": 1.6232030391693115, + "sampling/importance_sampling_ratio/mean": 0.7850979566574097, + "sampling/importance_sampling_ratio/min": 0.14968660473823547, + "sampling/sampling_logp_difference/max": 0.8372163772583008, + "sampling/sampling_logp_difference/mean": 0.026052938774228096, + "step": 179, + "step_time": 43.08452248299727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.361512690782547, + "epoch": 0.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9003314971923828, + "kl": 0.021023273468017578, + "learning_rate": 4.7145077869065815e-06, + "loss": -0.0662, + "num_tokens": 1008504.0, + "reward": 0.33375000953674316, + "reward_std": 0.569517970085144, + "rewards/reward_func/mean": 0.33375000953674316, + "rewards/reward_func/std": 0.5475122332572937, + "sampling/importance_sampling_ratio/max": 1.3407214879989624, + "sampling/importance_sampling_ratio/mean": 0.9676664471626282, + "sampling/importance_sampling_ratio/min": 0.5192452073097229, + "sampling/sampling_logp_difference/max": 0.6442506909370422, + "sampling/sampling_logp_difference/mean": 0.026277683675289154, + "step": 180, + "step_time": 35.98104014099226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.348086416721344, + "epoch": 0.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.418018102645874, + "kl": 0.05438853055238724, + "learning_rate": 4.710738726825059e-06, + "loss": 0.3842, + "num_tokens": 1014483.0, + "reward": 0.3474999964237213, + "reward_std": 0.538013756275177, + "rewards/reward_func/mean": 0.3474999964237213, + "rewards/reward_func/std": 0.5186452269554138, + "sampling/importance_sampling_ratio/max": 2.6314172744750977, + "sampling/importance_sampling_ratio/mean": 1.0734989643096924, + "sampling/importance_sampling_ratio/min": 0.10073666274547577, + "sampling/sampling_logp_difference/max": 1.0612448453903198, + "sampling/sampling_logp_difference/mean": 0.027826346457004547, + "step": 181, + "step_time": 31.0331748949975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3266671299934387, + "epoch": 0.364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.193312168121338, + "kl": 0.09049133956432343, + "learning_rate": 4.706946477140396e-06, + "loss": -0.1725, + "num_tokens": 1019583.0, + "reward": 0.46000003814697266, + "reward_std": 0.5269915461540222, + "rewards/reward_func/mean": 0.46000003814697266, + "rewards/reward_func/std": 0.55250084400177, + "sampling/importance_sampling_ratio/max": 1.7039434909820557, + "sampling/importance_sampling_ratio/mean": 1.0256106853485107, + "sampling/importance_sampling_ratio/min": 0.3618232309818268, + "sampling/sampling_logp_difference/max": 0.6861605644226074, + "sampling/sampling_logp_difference/mean": 0.02597002312541008, + "step": 182, + "step_time": 32.926690759995836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3671197295188904, + "epoch": 0.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.113112211227417, + "kl": 0.1199764832854271, + "learning_rate": 4.703131077631498e-06, + "loss": -0.1404, + "num_tokens": 1025042.0, + "reward": 0.08250000327825546, + "reward_std": 0.27332803606987, + "rewards/reward_func/mean": 0.08250000327825546, + "rewards/reward_func/std": 0.37297070026397705, + "sampling/importance_sampling_ratio/max": 1.334097146987915, + "sampling/importance_sampling_ratio/mean": 0.5801923274993896, + "sampling/importance_sampling_ratio/min": 0.1981949657201767, + "sampling/sampling_logp_difference/max": 1.5971126556396484, + "sampling/sampling_logp_difference/mean": 0.035717226564884186, + "step": 183, + "step_time": 34.52302819299803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3453419804573059, + "epoch": 0.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4540451765060425, + "kl": 0.02787589468061924, + "learning_rate": 4.699292568320097e-06, + "loss": -0.1431, + "num_tokens": 1030316.0, + "reward": 0.20125000178813934, + "reward_std": 0.32598677277565, + "rewards/reward_func/mean": 0.20125000178813934, + "rewards/reward_func/std": 0.4764583110809326, + "sampling/importance_sampling_ratio/max": 2.2556746006011963, + "sampling/importance_sampling_ratio/mean": 1.255730390548706, + "sampling/importance_sampling_ratio/min": 0.7337526679039001, + "sampling/sampling_logp_difference/max": 0.6424871683120728, + "sampling/sampling_logp_difference/mean": 0.03114481456577778, + "step": 184, + "step_time": 40.47120292400359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 50.625, + "completions/mean_terminated_length": 50.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3271094262599945, + "epoch": 0.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8317610025405884, + "kl": 0.015337463468313217, + "learning_rate": 4.6954309894703435e-06, + "loss": 0.132, + "num_tokens": 1035537.0, + "reward": 0.33500000834465027, + "reward_std": 0.571208119392395, + "rewards/reward_func/mean": 0.33500000834465027, + "rewards/reward_func/std": 0.5463123321533203, + "sampling/importance_sampling_ratio/max": 1.6981804370880127, + "sampling/importance_sampling_ratio/mean": 0.7978900671005249, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.0647196769714355, + "sampling/sampling_logp_difference/mean": 0.025768490508198738, + "step": 185, + "step_time": 36.403069805994164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3013562560081482, + "epoch": 0.372, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3188611268997192, + "kl": 0.0544801689684391, + "learning_rate": 4.69154638158837e-06, + "loss": 0.3933, + "num_tokens": 1041100.0, + "reward": 0.06624999642372131, + "reward_std": 0.28185734152793884, + "rewards/reward_func/mean": 0.06624999642372131, + "rewards/reward_func/std": 0.3781132698059082, + "sampling/importance_sampling_ratio/max": 1.8907493352890015, + "sampling/importance_sampling_ratio/mean": 0.9408384561538696, + "sampling/importance_sampling_ratio/min": 0.3894776403903961, + "sampling/sampling_logp_difference/max": 0.856117844581604, + "sampling/sampling_logp_difference/mean": 0.027976665645837784, + "step": 186, + "step_time": 45.57103870699939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 48.125, + "completions/mean_terminated_length": 48.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.39340823888778687, + "epoch": 0.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1754374504089355, + "kl": 0.020966093987226486, + "learning_rate": 4.687638785421875e-06, + "loss": -0.0169, + "num_tokens": 1048354.0, + "reward": 0.32375001907348633, + "reward_std": 0.5538316965103149, + "rewards/reward_func/mean": 0.32375001907348633, + "rewards/reward_func/std": 0.5343871712684631, + "sampling/importance_sampling_ratio/max": 2.006873846054077, + "sampling/importance_sampling_ratio/mean": 0.89984130859375, + "sampling/importance_sampling_ratio/min": 0.4372103810310364, + "sampling/sampling_logp_difference/max": 0.740842342376709, + "sampling/sampling_logp_difference/mean": 0.029992148280143738, + "step": 187, + "step_time": 50.629349813010776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3264714479446411, + "epoch": 0.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4700989723205566, + "kl": 0.01745157688856125, + "learning_rate": 4.683708241959694e-06, + "loss": 0.3489, + "num_tokens": 1054088.0, + "reward": 0.5987499952316284, + "reward_std": 0.545418381690979, + "rewards/reward_func/mean": 0.5987499952316284, + "rewards/reward_func/std": 0.524034857749939, + "sampling/importance_sampling_ratio/max": 2.040449857711792, + "sampling/importance_sampling_ratio/mean": 1.0357085466384888, + "sampling/importance_sampling_ratio/min": 0.47571852803230286, + "sampling/sampling_logp_difference/max": 0.6641407012939453, + "sampling/sampling_logp_difference/mean": 0.027400383725762367, + "step": 188, + "step_time": 30.43117203100701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3622972071170807, + "epoch": 0.378, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.803978681564331, + "kl": 0.015578195452690125, + "learning_rate": 4.679754792431368e-06, + "loss": 0.0812, + "num_tokens": 1059205.0, + "reward": 0.4424999952316284, + "reward_std": 0.6157840490341187, + "rewards/reward_func/mean": 0.4424999952316284, + "rewards/reward_func/std": 0.5730806589126587, + "sampling/importance_sampling_ratio/max": 1.8119560480117798, + "sampling/importance_sampling_ratio/mean": 0.7570701837539673, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.7887172698974609, + "sampling/sampling_logp_difference/mean": 0.023639146238565445, + "step": 189, + "step_time": 29.315127633002703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.36829447746276855, + "epoch": 0.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.311635136604309, + "kl": 0.03190737962722778, + "learning_rate": 4.675778478306712e-06, + "loss": 0.068, + "num_tokens": 1064888.0, + "reward": 0.0625000074505806, + "reward_std": 0.2896197438240051, + "rewards/reward_func/mean": 0.0625000074505806, + "rewards/reward_func/std": 0.3778038024902344, + "sampling/importance_sampling_ratio/max": 2.0182361602783203, + "sampling/importance_sampling_ratio/mean": 1.091983675956726, + "sampling/importance_sampling_ratio/min": 0.5430738925933838, + "sampling/sampling_logp_difference/max": 0.522942304611206, + "sampling/sampling_logp_difference/mean": 0.02682231366634369, + "step": 190, + "step_time": 38.76032414600195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 51.625, + "completions/mean_terminated_length": 51.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.33380353450775146, + "epoch": 0.382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2356117963790894, + "kl": 0.012882580980658531, + "learning_rate": 4.671779341295378e-06, + "loss": -0.0225, + "num_tokens": 1069880.0, + "reward": 0.4675000011920929, + "reward_std": 0.5203424692153931, + "rewards/reward_func/mean": 0.4675000011920929, + "rewards/reward_func/std": 0.548341691493988, + "sampling/importance_sampling_ratio/max": 1.7154066562652588, + "sampling/importance_sampling_ratio/mean": 1.2481818199157715, + "sampling/importance_sampling_ratio/min": 0.59772789478302, + "sampling/sampling_logp_difference/max": 0.43304991722106934, + "sampling/sampling_logp_difference/mean": 0.022705163806676865, + "step": 191, + "step_time": 34.24753138900269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3223872184753418, + "epoch": 0.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1335781812667847, + "kl": 0.05066206306219101, + "learning_rate": 4.667757423346423e-06, + "loss": -0.2483, + "num_tokens": 1074868.0, + "reward": 0.32625001668930054, + "reward_std": 0.5714980363845825, + "rewards/reward_func/mean": 0.32625001668930054, + "rewards/reward_func/std": 0.5501672029495239, + "sampling/importance_sampling_ratio/max": 2.406949520111084, + "sampling/importance_sampling_ratio/mean": 1.1955797672271729, + "sampling/importance_sampling_ratio/min": 0.2920358180999756, + "sampling/sampling_logp_difference/max": 1.1931378841400146, + "sampling/sampling_logp_difference/mean": 0.02424553781747818, + "step": 192, + "step_time": 29.6137595110049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3726162314414978, + "epoch": 0.386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8356767892837524, + "kl": 0.020604528486728668, + "learning_rate": 4.663712766647862e-06, + "loss": 0.143, + "num_tokens": 1081294.0, + "reward": 0.06499999761581421, + "reward_std": 0.3063734769821167, + "rewards/reward_func/mean": 0.06499999761581421, + "rewards/reward_func/std": 0.38172540068626404, + "sampling/importance_sampling_ratio/max": 1.441202163696289, + "sampling/importance_sampling_ratio/mean": 0.7192156314849854, + "sampling/importance_sampling_ratio/min": 0.4092276692390442, + "sampling/sampling_logp_difference/max": 0.7641327381134033, + "sampling/sampling_logp_difference/mean": 0.0280342735350132, + "step": 193, + "step_time": 40.439407575002406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 51.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.2863295376300812, + "epoch": 0.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9479438662528992, + "kl": 0.02047712728381157, + "learning_rate": 4.65964541362623e-06, + "loss": -0.1188, + "num_tokens": 1086752.0, + "reward": 0.33375000953674316, + "reward_std": 0.5453760623931885, + "rewards/reward_func/mean": 0.33375000953674316, + "rewards/reward_func/std": 0.5247839689254761, + "sampling/importance_sampling_ratio/max": 1.8372682332992554, + "sampling/importance_sampling_ratio/mean": 0.9729915857315063, + "sampling/importance_sampling_ratio/min": 0.4610009491443634, + "sampling/sampling_logp_difference/max": 0.9207382202148438, + "sampling/sampling_logp_difference/mean": 0.022392306476831436, + "step": 194, + "step_time": 35.24215532100061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.32116204500198364, + "epoch": 0.39, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1504021883010864, + "kl": 0.0485202930867672, + "learning_rate": 4.655555406946135e-06, + "loss": 0.0483, + "num_tokens": 1091989.0, + "reward": 0.19874998927116394, + "reward_std": 0.5272426605224609, + "rewards/reward_func/mean": 0.19874998927116394, + "rewards/reward_func/std": 0.4883042871952057, + "sampling/importance_sampling_ratio/max": 2.2885639667510986, + "sampling/importance_sampling_ratio/mean": 0.93783038854599, + "sampling/importance_sampling_ratio/min": 0.25737592577934265, + "sampling/sampling_logp_difference/max": 0.6654841899871826, + "sampling/sampling_logp_difference/mean": 0.022658096626400948, + "step": 195, + "step_time": 32.107545826991554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 51.25, + "completions/mean_terminated_length": 51.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.32712772488594055, + "epoch": 0.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.951930820941925, + "kl": 0.027490120381116867, + "learning_rate": 4.651442789509813e-06, + "loss": -0.1811, + "num_tokens": 1097346.0, + "reward": 0.08249999582767487, + "reward_std": 0.2915714383125305, + "rewards/reward_func/mean": 0.08249999582767487, + "rewards/reward_func/std": 0.373430073261261, + "sampling/importance_sampling_ratio/max": 1.5322624444961548, + "sampling/importance_sampling_ratio/mean": 0.8581938743591309, + "sampling/importance_sampling_ratio/min": 0.5205777883529663, + "sampling/sampling_logp_difference/max": 0.3183736801147461, + "sampling/sampling_logp_difference/mean": 0.018254410475492477, + "step": 196, + "step_time": 35.83734907700273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 46.375, + "completions/mean_terminated_length": 46.375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.39830613136291504, + "epoch": 0.394, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7966833114624023, + "kl": 0.026175260543823242, + "learning_rate": 4.647307604456675e-06, + "loss": -0.195, + "num_tokens": 1103633.0, + "reward": 0.20124998688697815, + "reward_std": 0.5211325883865356, + "rewards/reward_func/mean": 0.20124998688697815, + "rewards/reward_func/std": 0.48300954699516296, + "sampling/importance_sampling_ratio/max": 1.6678045988082886, + "sampling/importance_sampling_ratio/mean": 0.7011741399765015, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.3222107887268066, + "sampling/sampling_logp_difference/mean": 0.031116489320993423, + "step": 197, + "step_time": 42.751525571002276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3397493362426758, + "epoch": 0.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7519938945770264, + "kl": 0.02400045096874237, + "learning_rate": 4.643149895162854e-06, + "loss": -0.0213, + "num_tokens": 1108925.0, + "reward": 0.05000000447034836, + "reward_std": 0.28527507185935974, + "rewards/reward_func/mean": 0.05000000447034836, + "rewards/reward_func/std": 0.38455912470817566, + "sampling/importance_sampling_ratio/max": 2.140333890914917, + "sampling/importance_sampling_ratio/mean": 1.1589810848236084, + "sampling/importance_sampling_ratio/min": 0.36448001861572266, + "sampling/sampling_logp_difference/max": 0.5630743503570557, + "sampling/sampling_logp_difference/mean": 0.02603762596845627, + "step": 198, + "step_time": 34.99613412701001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 51.75, + "completions/mean_terminated_length": 51.75, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3567110598087311, + "epoch": 0.398, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1194331645965576, + "kl": 0.018017902970314026, + "learning_rate": 4.6389697052407535e-06, + "loss": -0.1716, + "num_tokens": 1114668.0, + "reward": 0.05625000596046448, + "reward_std": 0.29238367080688477, + "rewards/reward_func/mean": 0.05625000596046448, + "rewards/reward_func/std": 0.36862435936927795, + "sampling/importance_sampling_ratio/max": 1.573565125465393, + "sampling/importance_sampling_ratio/mean": 1.1634211540222168, + "sampling/importance_sampling_ratio/min": 0.45238780975341797, + "sampling/sampling_logp_difference/max": 0.3439953327178955, + "sampling/sampling_logp_difference/mean": 0.020245444029569626, + "step": 199, + "step_time": 37.81699361599749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 53.125, + "completions/mean_terminated_length": 53.125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.33191120624542236, + "epoch": 0.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3887816667556763, + "kl": 0.012048996984958649, + "learning_rate": 4.634767078538589e-06, + "loss": -0.3053, + "num_tokens": 1120285.0, + "reward": 0.16124999523162842, + "reward_std": 0.5615929365158081, + "rewards/reward_func/mean": 0.16124999523162842, + "rewards/reward_func/std": 0.5200669765472412, + "sampling/importance_sampling_ratio/max": 2.2076382637023926, + "sampling/importance_sampling_ratio/mean": 0.9878512620925903, + "sampling/importance_sampling_ratio/min": 0.5573294162750244, + "sampling/sampling_logp_difference/max": 0.4176292419433594, + "sampling/sampling_logp_difference/mean": 0.025101102888584137, + "step": 200, + "step_time": 40.36116412401316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3108067512512207, + "epoch": 0.402, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.10872745513916, + "kl": 0.01511746272444725, + "learning_rate": 4.630542059139923e-06, + "loss": 0.1775, + "num_tokens": 1125744.0, + "reward": 0.06999999284744263, + "reward_std": 0.28674817085266113, + "rewards/reward_func/mean": 0.06999999284744263, + "rewards/reward_func/std": 0.3770941495895386, + "sampling/importance_sampling_ratio/max": 1.9700719118118286, + "sampling/importance_sampling_ratio/mean": 1.2726753950119019, + "sampling/importance_sampling_ratio/min": 0.38142406940460205, + "sampling/sampling_logp_difference/max": 0.5226891040802002, + "sampling/sampling_logp_difference/mean": 0.024665817618370056, + "step": 201, + "step_time": 40.36325747499359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.3040258288383484, + "epoch": 0.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3096510171890259, + "kl": 0.024469975382089615, + "learning_rate": 4.626294691363213e-06, + "loss": -0.0912, + "num_tokens": 1131445.0, + "reward": 0.16249999403953552, + "reward_std": 0.5160882472991943, + "rewards/reward_func/mean": 0.16249999403953552, + "rewards/reward_func/std": 0.47918832302093506, + "sampling/importance_sampling_ratio/max": 2.0163538455963135, + "sampling/importance_sampling_ratio/mean": 1.1956194639205933, + "sampling/importance_sampling_ratio/min": 0.6150079369544983, + "sampling/sampling_logp_difference/max": 0.5746273994445801, + "sampling/sampling_logp_difference/mean": 0.020709635689854622, + "step": 202, + "step_time": 47.36363880299905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3916836977005005, + "epoch": 0.406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0519437789916992, + "kl": 0.02574199251830578, + "learning_rate": 4.622025019761336e-06, + "loss": 0.1733, + "num_tokens": 1136838.0, + "reward": 0.32750001549720764, + "reward_std": 0.5597944259643555, + "rewards/reward_func/mean": 0.32750001549720764, + "rewards/reward_func/std": 0.5453636050224304, + "sampling/importance_sampling_ratio/max": 1.9452093839645386, + "sampling/importance_sampling_ratio/mean": 1.0770931243896484, + "sampling/importance_sampling_ratio/min": 0.3852846026420593, + "sampling/sampling_logp_difference/max": 0.5705301761627197, + "sampling/sampling_logp_difference/mean": 0.027909845113754272, + "step": 203, + "step_time": 33.44002888299292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.36012089252471924, + "epoch": 0.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2741502523422241, + "kl": 0.014535860158503056, + "learning_rate": 4.617733089121127e-06, + "loss": 0.0169, + "num_tokens": 1141938.0, + "reward": 0.21375000476837158, + "reward_std": 0.32701846957206726, + "rewards/reward_func/mean": 0.21375000476837158, + "rewards/reward_func/std": 0.48091989755630493, + "sampling/importance_sampling_ratio/max": 1.4291470050811768, + "sampling/importance_sampling_ratio/mean": 0.9600297212600708, + "sampling/importance_sampling_ratio/min": 0.23359182476997375, + "sampling/sampling_logp_difference/max": 0.5247984528541565, + "sampling/sampling_logp_difference/mean": 0.02588305063545704, + "step": 204, + "step_time": 34.81417223700555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 46.625, + "completions/mean_terminated_length": 46.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3439635932445526, + "epoch": 0.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9802369475364685, + "kl": 0.047367069870233536, + "learning_rate": 4.613418944462907e-06, + "loss": 0.1454, + "num_tokens": 1147427.0, + "reward": 0.19749999046325684, + "reward_std": 0.5340633392333984, + "rewards/reward_func/mean": 0.19749999046325684, + "rewards/reward_func/std": 0.49505412578582764, + "sampling/importance_sampling_ratio/max": 1.7386454343795776, + "sampling/importance_sampling_ratio/mean": 0.8837832808494568, + "sampling/importance_sampling_ratio/min": 0.38633158802986145, + "sampling/sampling_logp_difference/max": 0.8758201599121094, + "sampling/sampling_logp_difference/mean": 0.03084520995616913, + "step": 205, + "step_time": 34.74832553599845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.375, + "completions/mean_terminated_length": 49.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3894272446632385, + "epoch": 0.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4726861715316772, + "kl": 0.014239441603422165, + "learning_rate": 4.609082631040012e-06, + "loss": -0.0745, + "num_tokens": 1152633.0, + "reward": 0.20000000298023224, + "reward_std": 0.5340527892112732, + "rewards/reward_func/mean": 0.20000000298023224, + "rewards/reward_func/std": 0.4956093430519104, + "sampling/importance_sampling_ratio/max": 1.6325794458389282, + "sampling/importance_sampling_ratio/mean": 0.9623055458068848, + "sampling/importance_sampling_ratio/min": 0.37618598341941833, + "sampling/sampling_logp_difference/max": 0.5434499979019165, + "sampling/sampling_logp_difference/mean": 0.027271058410406113, + "step": 206, + "step_time": 33.83777969199582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.37837862968444824, + "epoch": 0.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0193114280700684, + "kl": 0.02881534770131111, + "learning_rate": 4.604724194338318e-06, + "loss": 0.2341, + "num_tokens": 1157895.0, + "reward": -0.05375000089406967, + "reward_std": 0.03522847592830658, + "rewards/reward_func/mean": -0.05375000089406967, + "rewards/reward_func/std": 0.03335416316986084, + "sampling/importance_sampling_ratio/max": 2.33683180809021, + "sampling/importance_sampling_ratio/mean": 1.0109095573425293, + "sampling/importance_sampling_ratio/min": 0.38880789279937744, + "sampling/sampling_logp_difference/max": 0.8555021286010742, + "sampling/sampling_logp_difference/mean": 0.030559619888663292, + "step": 207, + "step_time": 42.82284573499055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.30469053983688354, + "epoch": 0.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4579194784164429, + "kl": 0.02601119875907898, + "learning_rate": 4.600343680075764e-06, + "loss": 0.0158, + "num_tokens": 1163475.0, + "reward": 0.4625000059604645, + "reward_std": 0.03025972843170166, + "rewards/reward_func/mean": 0.4625000059604645, + "rewards/reward_func/std": 0.559508740901947, + "sampling/importance_sampling_ratio/max": 2.2815041542053223, + "sampling/importance_sampling_ratio/mean": 1.3477814197540283, + "sampling/importance_sampling_ratio/min": 0.47539442777633667, + "sampling/sampling_logp_difference/max": 0.38339972496032715, + "sampling/sampling_logp_difference/mean": 0.01848520152270794, + "step": 208, + "step_time": 29.89221857400844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 54.75, + "completions/mean_terminated_length": 54.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.33555877208709717, + "epoch": 0.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3514803647994995, + "kl": 0.015731051564216614, + "learning_rate": 4.5959411342018715e-06, + "loss": -0.0843, + "num_tokens": 1168580.0, + "reward": 0.5887500047683716, + "reward_std": 0.5457277894020081, + "rewards/reward_func/mean": 0.5887500047683716, + "rewards/reward_func/std": 0.5272689461708069, + "sampling/importance_sampling_ratio/max": 1.7015126943588257, + "sampling/importance_sampling_ratio/mean": 1.1384941339492798, + "sampling/importance_sampling_ratio/min": 0.5749424695968628, + "sampling/sampling_logp_difference/max": 0.4752795696258545, + "sampling/sampling_logp_difference/mean": 0.024254605174064636, + "step": 209, + "step_time": 27.062604751001345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.36218172311782837, + "epoch": 0.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8917838335037231, + "kl": 0.012122070416808128, + "learning_rate": 4.591516602897263e-06, + "loss": -0.0551, + "num_tokens": 1174312.0, + "reward": 0.17500001192092896, + "reward_std": 0.5405874252319336, + "rewards/reward_func/mean": 0.17500001192092896, + "rewards/reward_func/std": 0.5007423162460327, + "sampling/importance_sampling_ratio/max": 1.3169264793395996, + "sampling/importance_sampling_ratio/mean": 0.9016479849815369, + "sampling/importance_sampling_ratio/min": 0.5373066663742065, + "sampling/sampling_logp_difference/max": 0.4783933162689209, + "sampling/sampling_logp_difference/mean": 0.021929645910859108, + "step": 210, + "step_time": 43.65166889700049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 51.375, + "completions/mean_terminated_length": 51.375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.34002870321273804, + "epoch": 0.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7652380466461182, + "kl": 0.0239486712962389, + "learning_rate": 4.587070132573178e-06, + "loss": 0.2205, + "num_tokens": 1180592.0, + "reward": 0.19624999165534973, + "reward_std": 0.5029024481773376, + "rewards/reward_func/mean": 0.19624999165534973, + "rewards/reward_func/std": 0.46650490164756775, + "sampling/importance_sampling_ratio/max": 1.5237053632736206, + "sampling/importance_sampling_ratio/mean": 0.8609182238578796, + "sampling/importance_sampling_ratio/min": 0.4023110866546631, + "sampling/sampling_logp_difference/max": 0.5878002643585205, + "sampling/sampling_logp_difference/mean": 0.0249373409897089, + "step": 211, + "step_time": 39.88526947100763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 59.75, + "completions/mean_terminated_length": 59.75, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.34226059913635254, + "epoch": 0.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0378315448760986, + "kl": 0.01290669571608305, + "learning_rate": 4.582601769870988e-06, + "loss": 0.3469, + "num_tokens": 1185853.0, + "reward": 0.1850000023841858, + "reward_std": 0.5251417756080627, + "rewards/reward_func/mean": 0.1850000023841858, + "rewards/reward_func/std": 0.4867970943450928, + "sampling/importance_sampling_ratio/max": 2.886341094970703, + "sampling/importance_sampling_ratio/mean": 1.2305059432983398, + "sampling/importance_sampling_ratio/min": 0.39663147926330566, + "sampling/sampling_logp_difference/max": 0.7663023471832275, + "sampling/sampling_logp_difference/mean": 0.022634129971265793, + "step": 212, + "step_time": 34.38220458901196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3151271939277649, + "epoch": 0.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8756702542304993, + "kl": 0.03317102789878845, + "learning_rate": 4.578111561661702e-06, + "loss": 0.167, + "num_tokens": 1191085.0, + "reward": 0.19499999284744263, + "reward_std": 0.5074241161346436, + "rewards/reward_func/mean": 0.19499999284744263, + "rewards/reward_func/std": 0.4701063930988312, + "sampling/importance_sampling_ratio/max": 1.702741026878357, + "sampling/importance_sampling_ratio/mean": 0.949587881565094, + "sampling/importance_sampling_ratio/min": 0.2907824218273163, + "sampling/sampling_logp_difference/max": 0.7317852973937988, + "sampling/sampling_logp_difference/mean": 0.023712046444416046, + "step": 213, + "step_time": 32.26193818300089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.4092119336128235, + "epoch": 0.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1255816221237183, + "kl": 0.019677717238664627, + "learning_rate": 4.57359955504548e-06, + "loss": 0.1728, + "num_tokens": 1196632.0, + "reward": 0.20499999821186066, + "reward_std": 0.526760458946228, + "rewards/reward_func/mean": 0.20499999821186066, + "rewards/reward_func/std": 0.4881451725959778, + "sampling/importance_sampling_ratio/max": 2.468197822570801, + "sampling/importance_sampling_ratio/mean": 1.0661287307739258, + "sampling/importance_sampling_ratio/min": 0.45330435037612915, + "sampling/sampling_logp_difference/max": 0.42670774459838867, + "sampling/sampling_logp_difference/mean": 0.029793616384267807, + "step": 214, + "step_time": 35.49122104200069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.37527209520339966, + "epoch": 0.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.897345781326294, + "kl": 0.02400030940771103, + "learning_rate": 4.569065797351135e-06, + "loss": 0.2748, + "num_tokens": 1202705.0, + "reward": 0.2224999964237213, + "reward_std": 0.5121018886566162, + "rewards/reward_func/mean": 0.2224999964237213, + "rewards/reward_func/std": 0.47415342926979065, + "sampling/importance_sampling_ratio/max": 1.8278297185897827, + "sampling/importance_sampling_ratio/mean": 1.0692496299743652, + "sampling/importance_sampling_ratio/min": 0.609551191329956, + "sampling/sampling_logp_difference/max": 0.5747532844543457, + "sampling/sampling_logp_difference/mean": 0.027012761682271957, + "step": 215, + "step_time": 37.0583976469934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.32061967253685, + "epoch": 0.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4469271898269653, + "kl": 0.014168168418109417, + "learning_rate": 4.564510336135642e-06, + "loss": 0.0253, + "num_tokens": 1208393.0, + "reward": 0.3487499952316284, + "reward_std": 0.5584208965301514, + "rewards/reward_func/mean": 0.3487499952316284, + "rewards/reward_func/std": 0.539958655834198, + "sampling/importance_sampling_ratio/max": 1.755660891532898, + "sampling/importance_sampling_ratio/mean": 0.914620041847229, + "sampling/importance_sampling_ratio/min": 0.2794102728366852, + "sampling/sampling_logp_difference/max": 0.5640288591384888, + "sampling/sampling_logp_difference/mean": 0.026515036821365356, + "step": 216, + "step_time": 30.74552646500524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 45.125, + "completions/mean_terminated_length": 45.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3240736722946167, + "epoch": 0.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9326397776603699, + "kl": 0.03814445063471794, + "learning_rate": 4.559933219183631e-06, + "loss": 0.2147, + "num_tokens": 1213972.0, + "reward": 0.21375000476837158, + "reward_std": 0.5260501503944397, + "rewards/reward_func/mean": 0.21375000476837158, + "rewards/reward_func/std": 0.4870299696922302, + "sampling/importance_sampling_ratio/max": 2.4522364139556885, + "sampling/importance_sampling_ratio/mean": 0.9913116693496704, + "sampling/importance_sampling_ratio/min": 0.1499728411436081, + "sampling/sampling_logp_difference/max": 1.3662075996398926, + "sampling/sampling_logp_difference/mean": 0.03047903999686241, + "step": 217, + "step_time": 30.938302820009994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3097589612007141, + "epoch": 0.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0612341165542603, + "kl": 0.014435656368732452, + "learning_rate": 4.555334494506895e-06, + "loss": 0.0446, + "num_tokens": 1219147.0, + "reward": 0.33249998092651367, + "reward_std": 0.5609448552131653, + "rewards/reward_func/mean": 0.33249998092651367, + "rewards/reward_func/std": 0.5397817492485046, + "sampling/importance_sampling_ratio/max": 1.438905119895935, + "sampling/importance_sampling_ratio/mean": 0.8124980926513672, + "sampling/importance_sampling_ratio/min": 0.436778724193573, + "sampling/sampling_logp_difference/max": 0.38486456871032715, + "sampling/sampling_logp_difference/mean": 0.024791575968265533, + "step": 218, + "step_time": 33.168488173992955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3692587614059448, + "epoch": 0.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0102483034133911, + "kl": 0.06603902578353882, + "learning_rate": 4.550714210343879e-06, + "loss": -0.1641, + "num_tokens": 1224789.0, + "reward": 0.17499999701976776, + "reward_std": 0.32527798414230347, + "rewards/reward_func/mean": 0.17499999701976776, + "rewards/reward_func/std": 0.47343727946281433, + "sampling/importance_sampling_ratio/max": 1.1977847814559937, + "sampling/importance_sampling_ratio/mean": 0.8777039051055908, + "sampling/importance_sampling_ratio/min": 0.13160440325737, + "sampling/sampling_logp_difference/max": 1.4867031574249268, + "sampling/sampling_logp_difference/mean": 0.02460072562098503, + "step": 219, + "step_time": 42.85501969201141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3599325716495514, + "epoch": 0.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6400805711746216, + "kl": 0.010766595602035522, + "learning_rate": 4.546072415159179e-06, + "loss": -0.3985, + "num_tokens": 1230387.0, + "reward": 0.6000000238418579, + "reward_std": 0.5655125379562378, + "rewards/reward_func/mean": 0.6000000238418579, + "rewards/reward_func/std": 0.543901264667511, + "sampling/importance_sampling_ratio/max": 2.3945584297180176, + "sampling/importance_sampling_ratio/mean": 1.2363660335540771, + "sampling/importance_sampling_ratio/min": 0.5109630227088928, + "sampling/sampling_logp_difference/max": 0.43884068727493286, + "sampling/sampling_logp_difference/mean": 0.027709752321243286, + "step": 220, + "step_time": 31.669988991998252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.37169408798217773, + "epoch": 0.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.099573016166687, + "kl": 0.021356448531150818, + "learning_rate": 4.541409157643027e-06, + "loss": 0.2744, + "num_tokens": 1235517.0, + "reward": 0.07499999552965164, + "reward_std": 0.2952970862388611, + "rewards/reward_func/mean": 0.07499999552965164, + "rewards/reward_func/std": 0.37305688858032227, + "sampling/importance_sampling_ratio/max": 1.954587459564209, + "sampling/importance_sampling_ratio/mean": 0.9145734906196594, + "sampling/importance_sampling_ratio/min": 0.13262362778186798, + "sampling/sampling_logp_difference/max": 0.8386803865432739, + "sampling/sampling_logp_difference/mean": 0.031054425984621048, + "step": 221, + "step_time": 38.17915139699471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.38324177265167236, + "epoch": 0.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4401034116744995, + "kl": 0.012282771989703178, + "learning_rate": 4.5367244867107905e-06, + "loss": -0.0521, + "num_tokens": 1240753.0, + "reward": 0.19875000417232513, + "reward_std": 0.5245675444602966, + "rewards/reward_func/mean": 0.19875000417232513, + "rewards/reward_func/std": 0.48572295904159546, + "sampling/importance_sampling_ratio/max": 1.123025894165039, + "sampling/importance_sampling_ratio/mean": 0.8605712652206421, + "sampling/importance_sampling_ratio/min": 0.6056551933288574, + "sampling/sampling_logp_difference/max": 0.31289446353912354, + "sampling/sampling_logp_difference/mean": 0.02160695195198059, + "step": 222, + "step_time": 34.58787501300685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 45.75, + "completions/mean_terminated_length": 45.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.38685908913612366, + "epoch": 0.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5898383855819702, + "kl": 0.008172457106411457, + "learning_rate": 4.53201845150245e-06, + "loss": -0.1864, + "num_tokens": 1246752.0, + "reward": 0.06750000268220901, + "reward_std": 0.3026241660118103, + "rewards/reward_func/mean": 0.06750000268220901, + "rewards/reward_func/std": 0.37594643235206604, + "sampling/importance_sampling_ratio/max": 1.4219330549240112, + "sampling/importance_sampling_ratio/mean": 0.9449641704559326, + "sampling/importance_sampling_ratio/min": 0.49323397874832153, + "sampling/sampling_logp_difference/max": 0.4340834617614746, + "sampling/sampling_logp_difference/mean": 0.02197321318089962, + "step": 223, + "step_time": 43.65093676799734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 55.875, + "completions/mean_terminated_length": 55.875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3796464502811432, + "epoch": 0.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.050976276397705, + "kl": 0.013385515660047531, + "learning_rate": 4.527291101382088e-06, + "loss": -0.0046, + "num_tokens": 1252131.0, + "reward": 0.32124999165534973, + "reward_std": 0.5853705406188965, + "rewards/reward_func/mean": 0.32124999165534973, + "rewards/reward_func/std": 0.5598070621490479, + "sampling/importance_sampling_ratio/max": 1.8649810552597046, + "sampling/importance_sampling_ratio/mean": 0.9387291669845581, + "sampling/importance_sampling_ratio/min": 0.3916337490081787, + "sampling/sampling_logp_difference/max": 0.418468713760376, + "sampling/sampling_logp_difference/mean": 0.02320447750389576, + "step": 224, + "step_time": 25.26183516200399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.32041484117507935, + "epoch": 0.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1802082061767578, + "kl": 0.01214616559445858, + "learning_rate": 4.522542485937369e-06, + "loss": -0.0569, + "num_tokens": 1257710.0, + "reward": 0.29124999046325684, + "reward_std": 0.5801128149032593, + "rewards/reward_func/mean": 0.29124999046325684, + "rewards/reward_func/std": 0.5516064167022705, + "sampling/importance_sampling_ratio/max": 1.574925422668457, + "sampling/importance_sampling_ratio/mean": 1.1270374059677124, + "sampling/importance_sampling_ratio/min": 0.7811345458030701, + "sampling/sampling_logp_difference/max": 0.3047366142272949, + "sampling/sampling_logp_difference/mean": 0.018625199794769287, + "step": 225, + "step_time": 34.37411819701083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.3748812973499298, + "epoch": 0.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2252881526947021, + "kl": 0.05437376722693443, + "learning_rate": 4.517772654979024e-06, + "loss": 0.0628, + "num_tokens": 1263716.0, + "reward": 0.0925000011920929, + "reward_std": 0.2703875005245209, + "rewards/reward_func/mean": 0.0925000011920929, + "rewards/reward_func/std": 0.35591936111450195, + "sampling/importance_sampling_ratio/max": 1.9437322616577148, + "sampling/importance_sampling_ratio/mean": 0.9380783438682556, + "sampling/importance_sampling_ratio/min": 0.49177488684654236, + "sampling/sampling_logp_difference/max": 0.42700815200805664, + "sampling/sampling_logp_difference/mean": 0.027065452188253403, + "step": 226, + "step_time": 42.97744282700296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.34751734137535095, + "epoch": 0.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7339047193527222, + "kl": 0.019938740879297256, + "learning_rate": 4.512981658540321e-06, + "loss": 0.0353, + "num_tokens": 1269133.0, + "reward": 0.45624998211860657, + "reward_std": 0.5175138711929321, + "rewards/reward_func/mean": 0.45624998211860657, + "rewards/reward_func/std": 0.5480859875679016, + "sampling/importance_sampling_ratio/max": 1.4849156141281128, + "sampling/importance_sampling_ratio/mean": 0.8701044321060181, + "sampling/importance_sampling_ratio/min": 0.37403663992881775, + "sampling/sampling_logp_difference/max": 0.3767660856246948, + "sampling/sampling_logp_difference/mean": 0.024450505152344704, + "step": 227, + "step_time": 36.81575430399971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 45.0, + "completions/mean_terminated_length": 45.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.31638604402542114, + "epoch": 0.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3729960918426514, + "kl": 0.022523336112499237, + "learning_rate": 4.508169546876547e-06, + "loss": 0.1049, + "num_tokens": 1274836.0, + "reward": 0.08625000715255737, + "reward_std": 0.27319854497909546, + "rewards/reward_func/mean": 0.08625000715255737, + "rewards/reward_func/std": 0.35824722051620483, + "sampling/importance_sampling_ratio/max": 1.3125501871109009, + "sampling/importance_sampling_ratio/mean": 0.9860361814498901, + "sampling/importance_sampling_ratio/min": 0.7861380577087402, + "sampling/sampling_logp_difference/max": 0.4707716703414917, + "sampling/sampling_logp_difference/mean": 0.021509695798158646, + "step": 228, + "step_time": 37.25821232999442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.31023848056793213, + "epoch": 0.458, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1774016618728638, + "kl": 0.011443953029811382, + "learning_rate": 4.503336370464476e-06, + "loss": -0.1194, + "num_tokens": 1280640.0, + "reward": 0.08624999225139618, + "reward_std": 0.28077811002731323, + "rewards/reward_func/mean": 0.08624999225139618, + "rewards/reward_func/std": 0.3710193634033203, + "sampling/importance_sampling_ratio/max": 1.4775018692016602, + "sampling/importance_sampling_ratio/mean": 1.0044816732406616, + "sampling/importance_sampling_ratio/min": 0.7999988794326782, + "sampling/sampling_logp_difference/max": 0.5553348064422607, + "sampling/sampling_logp_difference/mean": 0.019959062337875366, + "step": 229, + "step_time": 38.90176290499221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.39089083671569824, + "epoch": 0.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.016448974609375, + "kl": 0.01295425184071064, + "learning_rate": 4.49848218000184e-06, + "loss": -0.1282, + "num_tokens": 1286908.0, + "reward": 0.07874999940395355, + "reward_std": 0.2610425353050232, + "rewards/reward_func/mean": 0.07874999940395355, + "rewards/reward_func/std": 0.3425717055797577, + "sampling/importance_sampling_ratio/max": 2.0288777351379395, + "sampling/importance_sampling_ratio/mean": 0.9484986066818237, + "sampling/importance_sampling_ratio/min": 0.4408459961414337, + "sampling/sampling_logp_difference/max": 0.4210505485534668, + "sampling/sampling_logp_difference/mean": 0.02616976760327816, + "step": 230, + "step_time": 46.000103306010715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3809959292411804, + "epoch": 0.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7100056409835815, + "kl": 0.0341075174510479, + "learning_rate": 4.493607026406802e-06, + "loss": 0.1388, + "num_tokens": 1291925.0, + "reward": 0.45249998569488525, + "reward_std": 0.5185065269470215, + "rewards/reward_func/mean": 0.45249998569488525, + "rewards/reward_func/std": 0.5677461624145508, + "sampling/importance_sampling_ratio/max": 1.71293306350708, + "sampling/importance_sampling_ratio/mean": 1.0076119899749756, + "sampling/importance_sampling_ratio/min": 0.2770913541316986, + "sampling/sampling_logp_difference/max": 0.845590353012085, + "sampling/sampling_logp_difference/mean": 0.028274953365325928, + "step": 231, + "step_time": 31.11168534900935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 52.0, + "completions/mean_terminated_length": 52.0, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3521654009819031, + "epoch": 0.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1459904909133911, + "kl": 0.014776136726140976, + "learning_rate": 4.488710960817416e-06, + "loss": 0.2149, + "num_tokens": 1297601.0, + "reward": 0.20250000059604645, + "reward_std": 0.31824785470962524, + "rewards/reward_func/mean": 0.20250000059604645, + "rewards/reward_func/std": 0.4914047420024872, + "sampling/importance_sampling_ratio/max": 1.8615068197250366, + "sampling/importance_sampling_ratio/mean": 0.9899470806121826, + "sampling/importance_sampling_ratio/min": 0.4704228937625885, + "sampling/sampling_logp_difference/max": 0.34760284423828125, + "sampling/sampling_logp_difference/mean": 0.02256855182349682, + "step": 232, + "step_time": 38.94866470299894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3941933214664459, + "epoch": 0.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7119557857513428, + "kl": 0.02081628516316414, + "learning_rate": 4.483794034591092e-06, + "loss": -0.0682, + "num_tokens": 1302788.0, + "reward": 0.3425000011920929, + "reward_std": 0.5602338910102844, + "rewards/reward_func/mean": 0.3425000011920929, + "rewards/reward_func/std": 0.5391727089881897, + "sampling/importance_sampling_ratio/max": 2.1718764305114746, + "sampling/importance_sampling_ratio/mean": 1.3146883249282837, + "sampling/importance_sampling_ratio/min": 0.7097762823104858, + "sampling/sampling_logp_difference/max": 1.0572991371154785, + "sampling/sampling_logp_difference/mean": 0.030411353334784508, + "step": 233, + "step_time": 28.17538536399661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.3587740659713745, + "epoch": 0.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6080266237258911, + "kl": 0.015071025118231773, + "learning_rate": 4.4788562993040615e-06, + "loss": -0.037, + "num_tokens": 1308217.0, + "reward": 0.21000000834465027, + "reward_std": 0.5147542953491211, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.47683480381965637, + "sampling/importance_sampling_ratio/max": 2.2702443599700928, + "sampling/importance_sampling_ratio/mean": 1.0888086557388306, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.2313389778137207, + "sampling/sampling_logp_difference/mean": 0.027467992156744003, + "step": 234, + "step_time": 33.06014819799748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.388597309589386, + "epoch": 0.47, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2065192461013794, + "kl": 0.012199870310723782, + "learning_rate": 4.473897806750829e-06, + "loss": 0.0499, + "num_tokens": 1313978.0, + "reward": 0.32625001668930054, + "reward_std": 0.5687720775604248, + "rewards/reward_func/mean": 0.32625001668930054, + "rewards/reward_func/std": 0.5401041507720947, + "sampling/importance_sampling_ratio/max": 1.6197782754898071, + "sampling/importance_sampling_ratio/mean": 1.1549599170684814, + "sampling/importance_sampling_ratio/min": 0.8198900818824768, + "sampling/sampling_logp_difference/max": 0.28801941871643066, + "sampling/sampling_logp_difference/mean": 0.01991274021565914, + "step": 235, + "step_time": 33.60801069700392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.30618584156036377, + "epoch": 0.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3444198369979858, + "kl": 0.01667897030711174, + "learning_rate": 4.4689186089436365e-06, + "loss": 0.268, + "num_tokens": 1319260.0, + "reward": 0.08624999970197678, + "reward_std": 0.2683984041213989, + "rewards/reward_func/mean": 0.08624999970197678, + "rewards/reward_func/std": 0.35900411009788513, + "sampling/importance_sampling_ratio/max": 1.9262661933898926, + "sampling/importance_sampling_ratio/mean": 1.1909027099609375, + "sampling/importance_sampling_ratio/min": 0.7153211832046509, + "sampling/sampling_logp_difference/max": 0.4925405979156494, + "sampling/sampling_logp_difference/mean": 0.020151756703853607, + "step": 236, + "step_time": 40.63417864299845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 43.625, + "completions/mean_terminated_length": 43.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3836323022842407, + "epoch": 0.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1380821466445923, + "kl": 0.01986077055335045, + "learning_rate": 4.463918758111912e-06, + "loss": 0.0285, + "num_tokens": 1325046.0, + "reward": -0.03500000014901161, + "reward_std": 0.015408330596983433, + "rewards/reward_func/mean": -0.03500000014901161, + "rewards/reward_func/std": 0.03545621037483215, + "sampling/importance_sampling_ratio/max": 1.6017059087753296, + "sampling/importance_sampling_ratio/mean": 0.8632282018661499, + "sampling/importance_sampling_ratio/min": 0.3874957859516144, + "sampling/sampling_logp_difference/max": 0.42335569858551025, + "sampling/sampling_logp_difference/mean": 0.02702953666448593, + "step": 237, + "step_time": 47.89170832399395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 53.125, + "completions/mean_terminated_length": 53.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.343445360660553, + "epoch": 0.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6716041564941406, + "kl": 0.022784311324357986, + "learning_rate": 4.4588983067017255e-06, + "loss": -0.1186, + "num_tokens": 1331213.0, + "reward": 0.3474999964237213, + "reward_std": 0.5585798621177673, + "rewards/reward_func/mean": 0.3474999964237213, + "rewards/reward_func/std": 0.5411298274993896, + "sampling/importance_sampling_ratio/max": 1.6867166757583618, + "sampling/importance_sampling_ratio/mean": 1.0986011028289795, + "sampling/importance_sampling_ratio/min": 0.353174090385437, + "sampling/sampling_logp_difference/max": 0.5639722347259521, + "sampling/sampling_logp_difference/mean": 0.023318957537412643, + "step": 238, + "step_time": 36.82882352700108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.38994842767715454, + "epoch": 0.478, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0082672834396362, + "kl": 0.010606281459331512, + "learning_rate": 4.4538573073752365e-06, + "loss": 0.0566, + "num_tokens": 1336322.0, + "reward": 0.06499999761581421, + "reward_std": 0.28198638558387756, + "rewards/reward_func/mean": 0.06499999761581421, + "rewards/reward_func/std": 0.3786064684391022, + "sampling/importance_sampling_ratio/max": 1.146636962890625, + "sampling/importance_sampling_ratio/mean": 0.8590273261070251, + "sampling/importance_sampling_ratio/min": 0.4441068470478058, + "sampling/sampling_logp_difference/max": 0.47839975357055664, + "sampling/sampling_logp_difference/mean": 0.023978423327207565, + "step": 239, + "step_time": 40.39131770400854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.32200464606285095, + "epoch": 0.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.661300778388977, + "kl": 0.01102864183485508, + "learning_rate": 4.448795813010142e-06, + "loss": -0.0057, + "num_tokens": 1341775.0, + "reward": 0.3475000262260437, + "reward_std": 0.5485206842422485, + "rewards/reward_func/mean": 0.3475000262260437, + "rewards/reward_func/std": 0.5300067663192749, + "sampling/importance_sampling_ratio/max": 1.252677321434021, + "sampling/importance_sampling_ratio/mean": 0.7707798480987549, + "sampling/importance_sampling_ratio/min": 0.4847687780857086, + "sampling/sampling_logp_difference/max": 0.5437004566192627, + "sampling/sampling_logp_difference/mean": 0.02054433897137642, + "step": 240, + "step_time": 40.95854337599303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3526754379272461, + "epoch": 0.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9396883249282837, + "kl": 0.01238167379051447, + "learning_rate": 4.443713876699124e-06, + "loss": 0.2082, + "num_tokens": 1347458.0, + "reward": 0.3349999785423279, + "reward_std": 0.5716516971588135, + "rewards/reward_func/mean": 0.3349999785423279, + "rewards/reward_func/std": 0.5458676218986511, + "sampling/importance_sampling_ratio/max": 1.5820939540863037, + "sampling/importance_sampling_ratio/mean": 0.9924447536468506, + "sampling/importance_sampling_ratio/min": 0.37981370091438293, + "sampling/sampling_logp_difference/max": 0.507704496383667, + "sampling/sampling_logp_difference/mean": 0.02272692322731018, + "step": 241, + "step_time": 35.560629765997874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3843768239021301, + "epoch": 0.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9482317566871643, + "kl": 0.016451558098196983, + "learning_rate": 4.438611551749288e-06, + "loss": 0.0273, + "num_tokens": 1353650.0, + "reward": 0.5887500047683716, + "reward_std": 0.2841942012310028, + "rewards/reward_func/mean": 0.5887500047683716, + "rewards/reward_func/std": 0.5169259905815125, + "sampling/importance_sampling_ratio/max": 1.5082266330718994, + "sampling/importance_sampling_ratio/mean": 0.8422431945800781, + "sampling/importance_sampling_ratio/min": 0.381510853767395, + "sampling/sampling_logp_difference/max": 0.3755103349685669, + "sampling/sampling_logp_difference/mean": 0.025023218244314194, + "step": 242, + "step_time": 37.83947860999615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.356048047542572, + "epoch": 0.486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7156015634536743, + "kl": 0.032819997519254684, + "learning_rate": 4.4334888916816096e-06, + "loss": 0.1398, + "num_tokens": 1359023.0, + "reward": 0.05500000715255737, + "reward_std": 0.30337631702423096, + "rewards/reward_func/mean": 0.05500000715255737, + "rewards/reward_func/std": 0.38619017601013184, + "sampling/importance_sampling_ratio/max": 1.3351407051086426, + "sampling/importance_sampling_ratio/mean": 0.8373122811317444, + "sampling/importance_sampling_ratio/min": 0.15892520546913147, + "sampling/sampling_logp_difference/max": 0.9354909658432007, + "sampling/sampling_logp_difference/mean": 0.025785677134990692, + "step": 243, + "step_time": 40.353877371002454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.3383341431617737, + "epoch": 0.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6086397171020508, + "kl": 0.009784823283553123, + "learning_rate": 4.42834595023037e-06, + "loss": -0.1637, + "num_tokens": 1363662.0, + "reward": 0.46000000834465027, + "reward_std": 0.5897172689437866, + "rewards/reward_func/mean": 0.46000000834465027, + "rewards/reward_func/std": 0.5466260313987732, + "sampling/importance_sampling_ratio/max": 1.7768175601959229, + "sampling/importance_sampling_ratio/mean": 1.0315792560577393, + "sampling/importance_sampling_ratio/min": 0.5198686718940735, + "sampling/sampling_logp_difference/max": 0.41960763931274414, + "sampling/sampling_logp_difference/mean": 0.022437244653701782, + "step": 244, + "step_time": 31.396963074992527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.2941707968711853, + "epoch": 0.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.212313175201416, + "kl": 0.018501581624150276, + "learning_rate": 4.423182781342589e-06, + "loss": 0.0024, + "num_tokens": 1368921.0, + "reward": 0.1850000023841858, + "reward_std": 0.34044092893600464, + "rewards/reward_func/mean": 0.1850000023841858, + "rewards/reward_func/std": 0.5024511218070984, + "sampling/importance_sampling_ratio/max": 1.3489513397216797, + "sampling/importance_sampling_ratio/mean": 0.8641585111618042, + "sampling/importance_sampling_ratio/min": 0.25969162583351135, + "sampling/sampling_logp_difference/max": 0.4768340587615967, + "sampling/sampling_logp_difference/mean": 0.022857919335365295, + "step": 245, + "step_time": 40.36035842899582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 48.125, + "completions/mean_terminated_length": 48.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.33388686180114746, + "epoch": 0.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8927440047264099, + "kl": 0.021221470087766647, + "learning_rate": 4.417999439177465e-06, + "loss": 0.0843, + "num_tokens": 1374815.0, + "reward": 0.3537500202655792, + "reward_std": 0.5465495586395264, + "rewards/reward_func/mean": 0.3537500202655792, + "rewards/reward_func/std": 0.5216988325119019, + "sampling/importance_sampling_ratio/max": 1.4427800178527832, + "sampling/importance_sampling_ratio/mean": 0.8960850238800049, + "sampling/importance_sampling_ratio/min": 0.3982952833175659, + "sampling/sampling_logp_difference/max": 0.543989896774292, + "sampling/sampling_logp_difference/mean": 0.022352319210767746, + "step": 246, + "step_time": 37.63646215200424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 51.0, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.30894017219543457, + "epoch": 0.494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6700345277786255, + "kl": 0.016512254253029823, + "learning_rate": 4.412795978105807e-06, + "loss": -0.1905, + "num_tokens": 1380366.0, + "reward": 0.19624999165534973, + "reward_std": 0.5302596092224121, + "rewards/reward_func/mean": 0.19624999165534973, + "rewards/reward_func/std": 0.4911484718322754, + "sampling/importance_sampling_ratio/max": 1.5147849321365356, + "sampling/importance_sampling_ratio/mean": 0.7344578504562378, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.0827524662017822, + "sampling/sampling_logp_difference/mean": 0.02441200613975525, + "step": 247, + "step_time": 39.67168925999431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 53.75, + "completions/mean_terminated_length": 53.75, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.36346104741096497, + "epoch": 0.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8582754135131836, + "kl": 0.013538405299186707, + "learning_rate": 4.407572452709459e-06, + "loss": -0.1299, + "num_tokens": 1385762.0, + "reward": 0.45249998569488525, + "reward_std": 0.6115769147872925, + "rewards/reward_func/mean": 0.45249998569488525, + "rewards/reward_func/std": 0.566385805606842, + "sampling/importance_sampling_ratio/max": 1.3523950576782227, + "sampling/importance_sampling_ratio/mean": 0.7945364117622375, + "sampling/importance_sampling_ratio/min": 0.34108811616897583, + "sampling/sampling_logp_difference/max": 0.6163909435272217, + "sampling/sampling_logp_difference/mean": 0.026093991473317146, + "step": 248, + "step_time": 34.434571129997494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 56.75, + "completions/mean_terminated_length": 56.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.38116583228111267, + "epoch": 0.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1535700559616089, + "kl": 0.011749478057026863, + "learning_rate": 4.402328917780728e-06, + "loss": -0.1905, + "num_tokens": 1391256.0, + "reward": 0.3187500238418579, + "reward_std": 0.5815805792808533, + "rewards/reward_func/mean": 0.3187500238418579, + "rewards/reward_func/std": 0.5606231093406677, + "sampling/importance_sampling_ratio/max": 1.868465542793274, + "sampling/importance_sampling_ratio/mean": 0.9490823745727539, + "sampling/importance_sampling_ratio/min": 0.3271576762199402, + "sampling/sampling_logp_difference/max": 0.7011950016021729, + "sampling/sampling_logp_difference/mean": 0.028965875506401062, + "step": 249, + "step_time": 35.004241451999405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.34539151191711426, + "epoch": 0.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1324069499969482, + "kl": 0.01937723159790039, + "learning_rate": 4.397065428321818e-06, + "loss": -0.0797, + "num_tokens": 1396691.0, + "reward": 0.5799999833106995, + "reward_std": 0.559051513671875, + "rewards/reward_func/mean": 0.5799999833106995, + "rewards/reward_func/std": 0.536443293094635, + "sampling/importance_sampling_ratio/max": 1.660628080368042, + "sampling/importance_sampling_ratio/mean": 0.9389946460723877, + "sampling/importance_sampling_ratio/min": 0.34798264503479004, + "sampling/sampling_logp_difference/max": 0.4318201541900635, + "sampling/sampling_logp_difference/mean": 0.026005076244473457, + "step": 250, + "step_time": 39.65480143901368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.2987058758735657, + "epoch": 0.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6359943747520447, + "kl": 0.0072782086208462715, + "learning_rate": 4.391782039544239e-06, + "loss": 0.0501, + "num_tokens": 1402379.0, + "reward": 0.34375, + "reward_std": 0.5505853891372681, + "rewards/reward_func/mean": 0.34375, + "rewards/reward_func/std": 0.5326735377311707, + "sampling/importance_sampling_ratio/max": 0.9640963673591614, + "sampling/importance_sampling_ratio/mean": 0.7049010396003723, + "sampling/importance_sampling_ratio/min": 0.3560267686843872, + "sampling/sampling_logp_difference/max": 0.34845149517059326, + "sampling/sampling_logp_difference/mean": 0.02234082669019699, + "step": 251, + "step_time": 40.00721578199591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3475812077522278, + "epoch": 0.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.947122573852539, + "kl": 0.012996821664273739, + "learning_rate": 4.386478806868242e-06, + "loss": -0.3883, + "num_tokens": 1407741.0, + "reward": 0.20624999701976776, + "reward_std": 0.5270916223526001, + "rewards/reward_func/mean": 0.20624999701976776, + "rewards/reward_func/std": 0.4883481562137604, + "sampling/importance_sampling_ratio/max": 2.576594352722168, + "sampling/importance_sampling_ratio/mean": 1.3187074661254883, + "sampling/importance_sampling_ratio/min": 0.5899462699890137, + "sampling/sampling_logp_difference/max": 0.4354560375213623, + "sampling/sampling_logp_difference/mean": 0.02314494550228119, + "step": 252, + "step_time": 36.81614575999265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.2992243468761444, + "epoch": 0.506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.242324709892273, + "kl": 0.016226768493652344, + "learning_rate": 4.381155785922226e-06, + "loss": 0.0511, + "num_tokens": 1413174.0, + "reward": 0.32750001549720764, + "reward_std": 0.5569428205490112, + "rewards/reward_func/mean": 0.32750001549720764, + "rewards/reward_func/std": 0.5400992035865784, + "sampling/importance_sampling_ratio/max": 1.7072101831436157, + "sampling/importance_sampling_ratio/mean": 1.1310184001922607, + "sampling/importance_sampling_ratio/min": 0.5692970752716064, + "sampling/sampling_logp_difference/max": 0.4309711456298828, + "sampling/sampling_logp_difference/mean": 0.019555598497390747, + "step": 253, + "step_time": 36.43075303900696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 53.875, + "completions/mean_terminated_length": 53.875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.33849331736564636, + "epoch": 0.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3686050176620483, + "kl": 0.01706480048596859, + "learning_rate": 4.375813032542164e-06, + "loss": -0.0148, + "num_tokens": 1418790.0, + "reward": 0.2212499976158142, + "reward_std": 0.5178108215332031, + "rewards/reward_func/mean": 0.2212499976158142, + "rewards/reward_func/std": 0.47944724559783936, + "sampling/importance_sampling_ratio/max": 1.8649122714996338, + "sampling/importance_sampling_ratio/mean": 1.1233329772949219, + "sampling/importance_sampling_ratio/min": 0.349303662776947, + "sampling/sampling_logp_difference/max": 0.4339733123779297, + "sampling/sampling_logp_difference/mean": 0.023459486663341522, + "step": 254, + "step_time": 37.26218052499462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 54.5, + "completions/mean_terminated_length": 54.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.4087204933166504, + "epoch": 0.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3738113641738892, + "kl": 0.03434944152832031, + "learning_rate": 4.37045060277101e-06, + "loss": -0.2582, + "num_tokens": 1424255.0, + "reward": 0.08124999701976776, + "reward_std": 0.284479558467865, + "rewards/reward_func/mean": 0.08124999701976776, + "rewards/reward_func/std": 0.3745640218257904, + "sampling/importance_sampling_ratio/max": 1.7234750986099243, + "sampling/importance_sampling_ratio/mean": 0.9960125684738159, + "sampling/importance_sampling_ratio/min": 0.38904088735580444, + "sampling/sampling_logp_difference/max": 0.6690880060195923, + "sampling/sampling_logp_difference/mean": 0.029072267934679985, + "step": 255, + "step_time": 41.67740165100258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 45.375, + "completions/mean_terminated_length": 45.375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.33174747228622437, + "epoch": 0.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5522558689117432, + "kl": 0.024778520688414574, + "learning_rate": 4.365068552858116e-06, + "loss": -0.0654, + "num_tokens": 1430284.0, + "reward": 0.3137499988079071, + "reward_std": 0.5697016716003418, + "rewards/reward_func/mean": 0.3137499988079071, + "rewards/reward_func/std": 0.5417943000793457, + "sampling/importance_sampling_ratio/max": 1.6336612701416016, + "sampling/importance_sampling_ratio/mean": 0.8506813049316406, + "sampling/importance_sampling_ratio/min": 0.30304935574531555, + "sampling/sampling_logp_difference/max": 0.8601186275482178, + "sampling/sampling_logp_difference/mean": 0.03514246642589569, + "step": 256, + "step_time": 40.16726562000986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3332866132259369, + "epoch": 0.514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.05963933467865, + "kl": 0.05120334029197693, + "learning_rate": 4.359666939258637e-06, + "loss": -0.143, + "num_tokens": 1435988.0, + "reward": 0.08000000566244125, + "reward_std": 0.2796610891819, + "rewards/reward_func/mean": 0.08000000566244125, + "rewards/reward_func/std": 0.3729420006275177, + "sampling/importance_sampling_ratio/max": 1.3755680322647095, + "sampling/importance_sampling_ratio/mean": 0.8436367511749268, + "sampling/importance_sampling_ratio/min": 0.3388536870479584, + "sampling/sampling_logp_difference/max": 1.2603816986083984, + "sampling/sampling_logp_difference/mean": 0.028931111097335815, + "step": 257, + "step_time": 41.19875999999931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.35343602299690247, + "epoch": 0.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8309938907623291, + "kl": 0.031175322830677032, + "learning_rate": 4.354245818632944e-06, + "loss": -0.0113, + "num_tokens": 1441589.0, + "reward": 0.32249999046325684, + "reward_std": 0.5780496597290039, + "rewards/reward_func/mean": 0.32249999046325684, + "rewards/reward_func/std": 0.5562823414802551, + "sampling/importance_sampling_ratio/max": 1.013479232788086, + "sampling/importance_sampling_ratio/mean": 0.7156726717948914, + "sampling/importance_sampling_ratio/min": 0.31125888228416443, + "sampling/sampling_logp_difference/max": 0.8599154949188232, + "sampling/sampling_logp_difference/mean": 0.029068203642964363, + "step": 258, + "step_time": 32.516722486005165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 49.875, + "completions/mean_terminated_length": 49.875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.3191331624984741, + "epoch": 0.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0082281827926636, + "kl": 0.0865778848528862, + "learning_rate": 4.348805247846027e-06, + "loss": 0.004, + "num_tokens": 1447611.0, + "reward": 0.35625001788139343, + "reward_std": 0.2691054940223694, + "rewards/reward_func/mean": 0.35625001788139343, + "rewards/reward_func/std": 0.5278781652450562, + "sampling/importance_sampling_ratio/max": 1.7518271207809448, + "sampling/importance_sampling_ratio/mean": 0.9650399684906006, + "sampling/importance_sampling_ratio/min": 0.23583200573921204, + "sampling/sampling_logp_difference/max": 0.9998927116394043, + "sampling/sampling_logp_difference/mean": 0.026092462241649628, + "step": 259, + "step_time": 39.387949302996276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3031730651855469, + "epoch": 0.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7463305592536926, + "kl": 0.01488940604031086, + "learning_rate": 4.343345283966901e-06, + "loss": -0.1232, + "num_tokens": 1452554.0, + "reward": 0.32375001907348633, + "reward_std": 0.5791251063346863, + "rewards/reward_func/mean": 0.32375001907348633, + "rewards/reward_func/std": 0.5504786968231201, + "sampling/importance_sampling_ratio/max": 1.5403791666030884, + "sampling/importance_sampling_ratio/mean": 1.0024334192276, + "sampling/importance_sampling_ratio/min": 0.6112100481987, + "sampling/sampling_logp_difference/max": 0.3157918453216553, + "sampling/sampling_logp_difference/mean": 0.01894155889749527, + "step": 260, + "step_time": 36.034989626001334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.35179832577705383, + "epoch": 0.522, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0443854331970215, + "kl": 0.0215410478413105, + "learning_rate": 4.337865984268002e-06, + "loss": -0.4475, + "num_tokens": 1458072.0, + "reward": 0.4775000214576721, + "reward_std": 0.5977118015289307, + "rewards/reward_func/mean": 0.4775000214576721, + "rewards/reward_func/std": 0.5537083148956299, + "sampling/importance_sampling_ratio/max": 1.6334444284439087, + "sampling/importance_sampling_ratio/mean": 1.0017789602279663, + "sampling/importance_sampling_ratio/min": 0.33515238761901855, + "sampling/sampling_logp_difference/max": 0.43834519386291504, + "sampling/sampling_logp_difference/mean": 0.02636917680501938, + "step": 261, + "step_time": 29.26977622200502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 52.375, + "completions/mean_terminated_length": 52.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3282299041748047, + "epoch": 0.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.814078450202942, + "kl": 0.04120957478880882, + "learning_rate": 4.33236740622459e-06, + "loss": -0.017, + "num_tokens": 1464432.0, + "reward": 0.19875000417232513, + "reward_std": 0.47839945554733276, + "rewards/reward_func/mean": 0.19875000417232513, + "rewards/reward_func/std": 0.444311261177063, + "sampling/importance_sampling_ratio/max": 1.5190731287002563, + "sampling/importance_sampling_ratio/mean": 1.122732162475586, + "sampling/importance_sampling_ratio/min": 0.47047433257102966, + "sampling/sampling_logp_difference/max": 0.6926932334899902, + "sampling/sampling_logp_difference/mean": 0.022747617214918137, + "step": 262, + "step_time": 44.020091628990485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 57.25, + "completions/mean_terminated_length": 57.25, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.357543408870697, + "epoch": 0.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.761099636554718, + "kl": 0.055734507739543915, + "learning_rate": 4.326849607514149e-06, + "loss": 0.2722, + "num_tokens": 1469989.0, + "reward": 0.33500000834465027, + "reward_std": 0.546402633190155, + "rewards/reward_func/mean": 0.33500000834465027, + "rewards/reward_func/std": 0.5292582511901855, + "sampling/importance_sampling_ratio/max": 1.8936946392059326, + "sampling/importance_sampling_ratio/mean": 0.8968003988265991, + "sampling/importance_sampling_ratio/min": 0.1377258002758026, + "sampling/sampling_logp_difference/max": 1.1581766605377197, + "sampling/sampling_logp_difference/mean": 0.030442828312516212, + "step": 263, + "step_time": 37.87564268500137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3408206105232239, + "epoch": 0.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1230255365371704, + "kl": 0.16760079562664032, + "learning_rate": 4.321312646015775e-06, + "loss": 0.0372, + "num_tokens": 1474707.0, + "reward": 0.4675000309944153, + "reward_std": 0.6010903120040894, + "rewards/reward_func/mean": 0.4675000309944153, + "rewards/reward_func/std": 0.5568469762802124, + "sampling/importance_sampling_ratio/max": 1.2672181129455566, + "sampling/importance_sampling_ratio/mean": 0.7914547920227051, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.173736572265625, + "sampling/sampling_logp_difference/mean": 0.028644565492868423, + "step": 264, + "step_time": 27.348199279993423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 54.5, + "completions/mean_terminated_length": 54.5, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3111991286277771, + "epoch": 0.53, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8316683173179626, + "kl": 0.02439902350306511, + "learning_rate": 4.315756579809575e-06, + "loss": 0.0201, + "num_tokens": 1479489.0, + "reward": 0.0637499988079071, + "reward_std": 0.2797112762928009, + "rewards/reward_func/mean": 0.0637499988079071, + "rewards/reward_func/std": 0.3812362551689148, + "sampling/importance_sampling_ratio/max": 1.1946301460266113, + "sampling/importance_sampling_ratio/mean": 0.888809323310852, + "sampling/importance_sampling_ratio/min": 0.5050463676452637, + "sampling/sampling_logp_difference/max": 0.7642940282821655, + "sampling/sampling_logp_difference/mean": 0.02073574811220169, + "step": 265, + "step_time": 39.65966210300394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.32368624210357666, + "epoch": 0.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.282889485359192, + "kl": 0.029365966096520424, + "learning_rate": 4.3101814671760546e-06, + "loss": -0.3961, + "num_tokens": 1484739.0, + "reward": 0.07999999821186066, + "reward_std": 0.2752854824066162, + "rewards/reward_func/mean": 0.07999999821186066, + "rewards/reward_func/std": 0.3690141439437866, + "sampling/importance_sampling_ratio/max": 2.9063940048217773, + "sampling/importance_sampling_ratio/mean": 1.0859081745147705, + "sampling/importance_sampling_ratio/min": 0.20924752950668335, + "sampling/sampling_logp_difference/max": 0.8566403388977051, + "sampling/sampling_logp_difference/mean": 0.026934346184134483, + "step": 266, + "step_time": 38.628261532008764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 50.875, + "completions/mean_terminated_length": 50.875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.33091580867767334, + "epoch": 0.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8325208425521851, + "kl": 0.04467456787824631, + "learning_rate": 4.304587366595506e-06, + "loss": -0.1739, + "num_tokens": 1490415.0, + "reward": -0.04374999552965164, + "reward_std": 0.03216441348195076, + "rewards/reward_func/mean": -0.04374999552965164, + "rewards/reward_func/std": 0.04749060049653053, + "sampling/importance_sampling_ratio/max": 1.7320250272750854, + "sampling/importance_sampling_ratio/mean": 0.6510964632034302, + "sampling/importance_sampling_ratio/min": 0.38174349069595337, + "sampling/sampling_logp_difference/max": 0.8815808296203613, + "sampling/sampling_logp_difference/mean": 0.030356641858816147, + "step": 267, + "step_time": 39.55001159000676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3176058530807495, + "epoch": 0.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0742965936660767, + "kl": 0.022770049050450325, + "learning_rate": 4.298974336747397e-06, + "loss": -0.0013, + "num_tokens": 1495288.0, + "reward": 0.3375000059604645, + "reward_std": 0.5672158598899841, + "rewards/reward_func/mean": 0.3375000059604645, + "rewards/reward_func/std": 0.5421057343482971, + "sampling/importance_sampling_ratio/max": 1.8634246587753296, + "sampling/importance_sampling_ratio/mean": 1.0370787382125854, + "sampling/importance_sampling_ratio/min": 0.7139883637428284, + "sampling/sampling_logp_difference/max": 0.35963261127471924, + "sampling/sampling_logp_difference/mean": 0.025577042251825333, + "step": 268, + "step_time": 28.42022311600158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3482901453971863, + "epoch": 0.538, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.574179172515869, + "kl": 0.020543672144412994, + "learning_rate": 4.2933424365097565e-06, + "loss": -0.3582, + "num_tokens": 1501351.0, + "reward": 0.19875000417232513, + "reward_std": 0.5214530229568481, + "rewards/reward_func/mean": 0.19875000417232513, + "rewards/reward_func/std": 0.48300954699516296, + "sampling/importance_sampling_ratio/max": 2.9328081607818604, + "sampling/importance_sampling_ratio/mean": 1.0341973304748535, + "sampling/importance_sampling_ratio/min": 0.38898590207099915, + "sampling/sampling_logp_difference/max": 0.5442459583282471, + "sampling/sampling_logp_difference/mean": 0.024556942284107208, + "step": 269, + "step_time": 41.42594156700943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 46.125, + "completions/mean_terminated_length": 46.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.33427339792251587, + "epoch": 0.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8722326159477234, + "kl": 0.0236725602298975, + "learning_rate": 4.287691724958551e-06, + "loss": -0.0266, + "num_tokens": 1506597.0, + "reward": -0.05250000208616257, + "reward_std": 0.053679704666137695, + "rewards/reward_func/mean": -0.05250000208616257, + "rewards/reward_func/std": 0.05391792953014374, + "sampling/importance_sampling_ratio/max": 0.9804509878158569, + "sampling/importance_sampling_ratio/mean": 0.7802181839942932, + "sampling/importance_sampling_ratio/min": 0.44377338886260986, + "sampling/sampling_logp_difference/max": 0.7215894460678101, + "sampling/sampling_logp_difference/mean": 0.02426327019929886, + "step": 270, + "step_time": 43.879117930002394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 54.125, + "completions/mean_terminated_length": 54.125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3636426031589508, + "epoch": 0.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8976055979728699, + "kl": 0.025950593873858452, + "learning_rate": 4.282022261367074e-06, + "loss": -0.0976, + "num_tokens": 1512282.0, + "reward": 0.5975000262260437, + "reward_std": 0.541487455368042, + "rewards/reward_func/mean": 0.5975000262260437, + "rewards/reward_func/std": 0.5228424668312073, + "sampling/importance_sampling_ratio/max": 2.1136491298675537, + "sampling/importance_sampling_ratio/mean": 0.9269047379493713, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.7309963703155518, + "sampling/sampling_logp_difference/mean": 0.02674085833132267, + "step": 271, + "step_time": 34.07076389199938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3050827085971832, + "epoch": 0.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4390958547592163, + "kl": 0.03130275756120682, + "learning_rate": 4.276334105205312e-06, + "loss": 0.0677, + "num_tokens": 1517354.0, + "reward": -0.038750000298023224, + "reward_std": 0.03077373281121254, + "rewards/reward_func/mean": -0.038750000298023224, + "rewards/reward_func/std": 0.0425734668970108, + "sampling/importance_sampling_ratio/max": 1.9479724168777466, + "sampling/importance_sampling_ratio/mean": 1.1777501106262207, + "sampling/importance_sampling_ratio/min": 0.8560696840286255, + "sampling/sampling_logp_difference/max": 0.40088653564453125, + "sampling/sampling_logp_difference/mean": 0.02281448245048523, + "step": 272, + "step_time": 33.767667939988314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 57.375, + "completions/mean_terminated_length": 57.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.35996952652931213, + "epoch": 0.546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6506816148757935, + "kl": 0.018656428903341293, + "learning_rate": 4.270627316139333e-06, + "loss": 0.0589, + "num_tokens": 1523210.0, + "reward": 0.1912499964237213, + "reward_std": 0.5389974117279053, + "rewards/reward_func/mean": 0.1912499964237213, + "rewards/reward_func/std": 0.4990401864051819, + "sampling/importance_sampling_ratio/max": 1.3763803243637085, + "sampling/importance_sampling_ratio/mean": 0.8559229969978333, + "sampling/importance_sampling_ratio/min": 0.3611724078655243, + "sampling/sampling_logp_difference/max": 0.7112209796905518, + "sampling/sampling_logp_difference/mean": 0.025731002911925316, + "step": 273, + "step_time": 43.23826266299875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.375, + "completions/mean_terminated_length": 51.375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3215975761413574, + "epoch": 0.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8488715887069702, + "kl": 0.0160782802850008, + "learning_rate": 4.264901954030655e-06, + "loss": 0.0883, + "num_tokens": 1528646.0, + "reward": 0.22625000774860382, + "reward_std": 0.5068466663360596, + "rewards/reward_func/mean": 0.22625000774860382, + "rewards/reward_func/std": 0.46940505504608154, + "sampling/importance_sampling_ratio/max": 1.3615248203277588, + "sampling/importance_sampling_ratio/mean": 0.8312337398529053, + "sampling/importance_sampling_ratio/min": 0.3102983236312866, + "sampling/sampling_logp_difference/max": 0.6900758743286133, + "sampling/sampling_logp_difference/mean": 0.023976415395736694, + "step": 274, + "step_time": 34.28163969999878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 55.5, + "completions/mean_terminated_length": 55.5, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.31386834383010864, + "epoch": 0.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1750917434692383, + "kl": 0.030527833849191666, + "learning_rate": 4.259158078935616e-06, + "loss": -0.12, + "num_tokens": 1534719.0, + "reward": 0.4662500023841858, + "reward_std": 0.5187262296676636, + "rewards/reward_func/mean": 0.4662500023841858, + "rewards/reward_func/std": 0.5655575394630432, + "sampling/importance_sampling_ratio/max": 2.275832414627075, + "sampling/importance_sampling_ratio/mean": 1.2086929082870483, + "sampling/importance_sampling_ratio/min": 0.4686194062232971, + "sampling/sampling_logp_difference/max": 0.6889495849609375, + "sampling/sampling_logp_difference/mean": 0.02524813637137413, + "step": 275, + "step_time": 41.8534838570049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.34471046924591064, + "epoch": 0.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9192721843719482, + "kl": 0.02106521837413311, + "learning_rate": 4.2533957511047485e-06, + "loss": 0.1216, + "num_tokens": 1540498.0, + "reward": 0.33124998211860657, + "reward_std": 0.5458966493606567, + "rewards/reward_func/mean": 0.33124998211860657, + "rewards/reward_func/std": 0.5239530205726624, + "sampling/importance_sampling_ratio/max": 1.4334321022033691, + "sampling/importance_sampling_ratio/mean": 0.9884998798370361, + "sampling/importance_sampling_ratio/min": 0.6590894460678101, + "sampling/sampling_logp_difference/max": 0.477333664894104, + "sampling/sampling_logp_difference/mean": 0.026044394820928574, + "step": 276, + "step_time": 43.16134984300879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 45.375, + "completions/mean_terminated_length": 45.375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.33258211612701416, + "epoch": 0.554, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4911836385726929, + "kl": 0.019767843186855316, + "learning_rate": 4.247615030982144e-06, + "loss": -0.3075, + "num_tokens": 1546073.0, + "reward": 0.21875, + "reward_std": 0.5179763436317444, + "rewards/reward_func/mean": 0.21875, + "rewards/reward_func/std": 0.479804664850235, + "sampling/importance_sampling_ratio/max": 1.665633201599121, + "sampling/importance_sampling_ratio/mean": 0.8182989954948425, + "sampling/importance_sampling_ratio/min": 0.2058224380016327, + "sampling/sampling_logp_difference/max": 0.527796745300293, + "sampling/sampling_logp_difference/mean": 0.027309097349643707, + "step": 277, + "step_time": 35.166972905994044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 49.875, + "completions/mean_terminated_length": 49.875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3460892140865326, + "epoch": 0.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7702014446258545, + "kl": 0.013081185519695282, + "learning_rate": 4.241815979204822e-06, + "loss": 0.2004, + "num_tokens": 1552244.0, + "reward": 0.35750001668930054, + "reward_std": 0.5452839136123657, + "rewards/reward_func/mean": 0.35750001668930054, + "rewards/reward_func/std": 0.5240433812141418, + "sampling/importance_sampling_ratio/max": 1.3788411617279053, + "sampling/importance_sampling_ratio/mean": 0.8693941235542297, + "sampling/importance_sampling_ratio/min": 0.21280725300312042, + "sampling/sampling_logp_difference/max": 0.6975330710411072, + "sampling/sampling_logp_difference/mean": 0.022449234500527382, + "step": 278, + "step_time": 35.046001802998944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 49.125, + "completions/mean_terminated_length": 49.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.34101182222366333, + "epoch": 0.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.937264621257782, + "kl": 0.03165788948535919, + "learning_rate": 4.235998656602091e-06, + "loss": -0.0022, + "num_tokens": 1558256.0, + "reward": 0.19625000655651093, + "reward_std": 0.331611692905426, + "rewards/reward_func/mean": 0.19625000655651093, + "rewards/reward_func/std": 0.48567885160446167, + "sampling/importance_sampling_ratio/max": 1.5198726654052734, + "sampling/importance_sampling_ratio/mean": 1.026062250137329, + "sampling/importance_sampling_ratio/min": 0.6299855709075928, + "sampling/sampling_logp_difference/max": 0.6371855735778809, + "sampling/sampling_logp_difference/mean": 0.02409491315484047, + "step": 279, + "step_time": 43.56885852699634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.36255669593811035, + "epoch": 0.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7901662588119507, + "kl": 0.013471327722072601, + "learning_rate": 4.230163124194913e-06, + "loss": -0.1862, + "num_tokens": 1564016.0, + "reward": -0.04874999821186066, + "reward_std": 0.06202464923262596, + "rewards/reward_func/mean": -0.04874999821186066, + "rewards/reward_func/std": 0.062435686588287354, + "sampling/importance_sampling_ratio/max": 1.5341285467147827, + "sampling/importance_sampling_ratio/mean": 0.8721048831939697, + "sampling/importance_sampling_ratio/min": 0.4120750427246094, + "sampling/sampling_logp_difference/max": 0.49291160702705383, + "sampling/sampling_logp_difference/mean": 0.02896583452820778, + "step": 280, + "step_time": 38.55246208499011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 55.375, + "completions/mean_terminated_length": 55.375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.33529117703437805, + "epoch": 0.562, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5709347724914551, + "kl": 0.01782386749982834, + "learning_rate": 4.224309443195261e-06, + "loss": -0.1666, + "num_tokens": 1569853.0, + "reward": 0.5987499952316284, + "reward_std": 0.5424566864967346, + "rewards/reward_func/mean": 0.5987499952316284, + "rewards/reward_func/std": 0.5208903551101685, + "sampling/importance_sampling_ratio/max": 1.183791995048523, + "sampling/importance_sampling_ratio/mean": 0.8074554204940796, + "sampling/importance_sampling_ratio/min": 0.19970110058784485, + "sampling/sampling_logp_difference/max": 0.8792259097099304, + "sampling/sampling_logp_difference/mean": 0.02342084050178528, + "step": 281, + "step_time": 36.61759810600779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 57.375, + "completions/mean_terminated_length": 57.375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.3397712707519531, + "epoch": 0.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9551227688789368, + "kl": 0.03373869135975838, + "learning_rate": 4.218437675005479e-06, + "loss": -0.0608, + "num_tokens": 1575868.0, + "reward": 0.33500000834465027, + "reward_std": 0.2718449532985687, + "rewards/reward_func/mean": 0.33500000834465027, + "rewards/reward_func/std": 0.540925145149231, + "sampling/importance_sampling_ratio/max": 1.8021279573440552, + "sampling/importance_sampling_ratio/mean": 1.119571566581726, + "sampling/importance_sampling_ratio/min": 0.7593724727630615, + "sampling/sampling_logp_difference/max": 0.6536552906036377, + "sampling/sampling_logp_difference/mean": 0.023292632773518562, + "step": 282, + "step_time": 45.945890542003326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 51.625, + "completions/mean_terminated_length": 51.625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.36750608682632446, + "epoch": 0.566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7954268455505371, + "kl": 0.017600055783987045, + "learning_rate": 4.212547881217637e-06, + "loss": 0.0046, + "num_tokens": 1582017.0, + "reward": 0.48374998569488525, + "reward_std": 0.5109193921089172, + "rewards/reward_func/mean": 0.48374998569488525, + "rewards/reward_func/std": 0.5493616461753845, + "sampling/importance_sampling_ratio/max": 1.646396517753601, + "sampling/importance_sampling_ratio/mean": 0.9751088619232178, + "sampling/importance_sampling_ratio/min": 0.5646697282791138, + "sampling/sampling_logp_difference/max": 0.4884145259857178, + "sampling/sampling_logp_difference/mean": 0.024717465043067932, + "step": 283, + "step_time": 35.48707418401318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.2997041642665863, + "epoch": 0.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9606561660766602, + "kl": 0.021286074072122574, + "learning_rate": 4.206640123612885e-06, + "loss": 0.034, + "num_tokens": 1587373.0, + "reward": 0.5887500047683716, + "reward_std": 0.5757082104682922, + "rewards/reward_func/mean": 0.5887500047683716, + "rewards/reward_func/std": 0.551217794418335, + "sampling/importance_sampling_ratio/max": 1.1931737661361694, + "sampling/importance_sampling_ratio/mean": 0.8760452270507812, + "sampling/importance_sampling_ratio/min": 0.608650267124176, + "sampling/sampling_logp_difference/max": 0.6686441898345947, + "sampling/sampling_logp_difference/mean": 0.022089166566729546, + "step": 284, + "step_time": 30.727206259995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 51.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3434939980506897, + "epoch": 0.57, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.880317211151123, + "kl": 0.013526841066777706, + "learning_rate": 4.2007144641608035e-06, + "loss": 0.0272, + "num_tokens": 1592909.0, + "reward": 0.2224999964237213, + "reward_std": 0.5191160440444946, + "rewards/reward_func/mean": 0.2224999964237213, + "rewards/reward_func/std": 0.4809737205505371, + "sampling/importance_sampling_ratio/max": 1.8023751974105835, + "sampling/importance_sampling_ratio/mean": 0.8636828064918518, + "sampling/importance_sampling_ratio/min": 0.35050126910209656, + "sampling/sampling_logp_difference/max": 0.8070380687713623, + "sampling/sampling_logp_difference/mean": 0.022738801315426826, + "step": 285, + "step_time": 35.6797738460009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.32169610261917114, + "epoch": 0.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8008424043655396, + "kl": 0.016660645604133606, + "learning_rate": 4.194770965018758e-06, + "loss": 0.093, + "num_tokens": 1598805.0, + "reward": 0.3537500202655792, + "reward_std": 0.2641814053058624, + "rewards/reward_func/mean": 0.3537500202655792, + "rewards/reward_func/std": 0.5189808011054993, + "sampling/importance_sampling_ratio/max": 1.3268340826034546, + "sampling/importance_sampling_ratio/mean": 0.8526763319969177, + "sampling/importance_sampling_ratio/min": 0.5635418891906738, + "sampling/sampling_logp_difference/max": 0.5913662910461426, + "sampling/sampling_logp_difference/mean": 0.0236099511384964, + "step": 286, + "step_time": 45.18931570999848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.31263816356658936, + "epoch": 0.574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.337044358253479, + "kl": 0.03852120786905289, + "learning_rate": 4.188809688531241e-06, + "loss": 0.18, + "num_tokens": 1604416.0, + "reward": 0.2225000113248825, + "reward_std": 0.2900446355342865, + "rewards/reward_func/mean": 0.2225000113248825, + "rewards/reward_func/std": 0.46191370487213135, + "sampling/importance_sampling_ratio/max": 1.3615566492080688, + "sampling/importance_sampling_ratio/mean": 0.8375392556190491, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9422614574432373, + "sampling/sampling_logp_difference/mean": 0.02491070330142975, + "step": 287, + "step_time": 41.96923489900655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.33257660269737244, + "epoch": 0.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0255043506622314, + "kl": 0.02244836464524269, + "learning_rate": 4.182830697229223e-06, + "loss": 0.1785, + "num_tokens": 1610302.0, + "reward": 0.34375, + "reward_std": 0.5376863479614258, + "rewards/reward_func/mean": 0.34375, + "rewards/reward_func/std": 0.516193151473999, + "sampling/importance_sampling_ratio/max": 1.905393362045288, + "sampling/importance_sampling_ratio/mean": 1.0131361484527588, + "sampling/importance_sampling_ratio/min": 0.235921248793602, + "sampling/sampling_logp_difference/max": 0.5580523014068604, + "sampling/sampling_logp_difference/mean": 0.025940991938114166, + "step": 288, + "step_time": 40.204126273994916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 60.0, + "completions/mean_terminated_length": 60.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3540540933609009, + "epoch": 0.578, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8518127202987671, + "kl": 0.01784059964120388, + "learning_rate": 4.176834053829492e-06, + "loss": 0.2212, + "num_tokens": 1615656.0, + "reward": 0.07500000298023224, + "reward_std": 0.2555869221687317, + "rewards/reward_func/mean": 0.07500000298023224, + "rewards/reward_func/std": 0.32802441716194153, + "sampling/importance_sampling_ratio/max": 1.4949760437011719, + "sampling/importance_sampling_ratio/mean": 0.8921905159950256, + "sampling/importance_sampling_ratio/min": 0.345406174659729, + "sampling/sampling_logp_difference/max": 0.3340027332305908, + "sampling/sampling_logp_difference/mean": 0.024094369262456894, + "step": 289, + "step_time": 42.1554738059931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 46.375, + "completions/mean_terminated_length": 46.375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.33654820919036865, + "epoch": 0.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7503964900970459, + "kl": 0.015095679089426994, + "learning_rate": 4.170819821234001e-06, + "loss": -0.026, + "num_tokens": 1621328.0, + "reward": 0.5987499952316284, + "reward_std": 0.5505416393280029, + "rewards/reward_func/mean": 0.5987499952316284, + "rewards/reward_func/std": 0.5330354571342468, + "sampling/importance_sampling_ratio/max": 1.6455559730529785, + "sampling/importance_sampling_ratio/mean": 0.9360400438308716, + "sampling/importance_sampling_ratio/min": 0.4872363209724426, + "sampling/sampling_logp_difference/max": 0.3570232391357422, + "sampling/sampling_logp_difference/mean": 0.020733939483761787, + "step": 290, + "step_time": 33.462042975996155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3192588984966278, + "epoch": 0.582, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.532139778137207, + "kl": 0.03818577155470848, + "learning_rate": 4.164788062529203e-06, + "loss": -0.0941, + "num_tokens": 1626908.0, + "reward": 0.19750000536441803, + "reward_std": 0.532228946685791, + "rewards/reward_func/mean": 0.19750000536441803, + "rewards/reward_func/std": 0.494534432888031, + "sampling/importance_sampling_ratio/max": 1.8081889152526855, + "sampling/importance_sampling_ratio/mean": 1.2644798755645752, + "sampling/importance_sampling_ratio/min": 0.8442208170890808, + "sampling/sampling_logp_difference/max": 0.6289647817611694, + "sampling/sampling_logp_difference/mean": 0.02317667007446289, + "step": 291, + "step_time": 36.49553343700245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3839607238769531, + "epoch": 0.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7589607238769531, + "kl": 0.0189202930778265, + "learning_rate": 4.158738840985393e-06, + "loss": -0.2175, + "num_tokens": 1632438.0, + "reward": 0.3112500011920929, + "reward_std": 0.5763314366340637, + "rewards/reward_func/mean": 0.3112500011920929, + "rewards/reward_func/std": 0.5487502217292786, + "sampling/importance_sampling_ratio/max": 2.402517318725586, + "sampling/importance_sampling_ratio/mean": 1.3916277885437012, + "sampling/importance_sampling_ratio/min": 0.8243075013160706, + "sampling/sampling_logp_difference/max": 0.3067154884338379, + "sampling/sampling_logp_difference/mean": 0.023358281701803207, + "step": 292, + "step_time": 39.84433910700318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 45.5, + "completions/mean_terminated_length": 45.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3257955014705658, + "epoch": 0.586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6930134296417236, + "kl": 0.03379783034324646, + "learning_rate": 4.1526722200560445e-06, + "loss": -0.252, + "num_tokens": 1637919.0, + "reward": 0.3362500071525574, + "reward_std": 0.5554932355880737, + "rewards/reward_func/mean": 0.3362500071525574, + "rewards/reward_func/std": 0.5346009731292725, + "sampling/importance_sampling_ratio/max": 1.87800133228302, + "sampling/importance_sampling_ratio/mean": 0.9086008071899414, + "sampling/importance_sampling_ratio/min": 0.3146730959415436, + "sampling/sampling_logp_difference/max": 1.1332507133483887, + "sampling/sampling_logp_difference/mean": 0.030632250010967255, + "step": 293, + "step_time": 28.30901631899178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3299206495285034, + "epoch": 0.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7860206961631775, + "kl": 0.012436825782060623, + "learning_rate": 4.146588263377137e-06, + "loss": -0.22, + "num_tokens": 1643427.0, + "reward": 0.07249999791383743, + "reward_std": 0.2847173810005188, + "rewards/reward_func/mean": 0.07249999791383743, + "rewards/reward_func/std": 0.3756803572177887, + "sampling/importance_sampling_ratio/max": 1.1498537063598633, + "sampling/importance_sampling_ratio/mean": 0.8128846883773804, + "sampling/importance_sampling_ratio/min": 0.3219602108001709, + "sampling/sampling_logp_difference/max": 0.7047772407531738, + "sampling/sampling_logp_difference/mean": 0.02216794341802597, + "step": 294, + "step_time": 33.63188786900719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 51.25, + "completions/mean_terminated_length": 51.25, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3545665740966797, + "epoch": 0.59, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8703710436820984, + "kl": 0.01798795722424984, + "learning_rate": 4.140487034766499e-06, + "loss": -0.0911, + "num_tokens": 1649592.0, + "reward": 0.08624999970197678, + "reward_std": 0.2947118580341339, + "rewards/reward_func/mean": 0.08624999970197678, + "rewards/reward_func/std": 0.3655109107494354, + "sampling/importance_sampling_ratio/max": 1.07265043258667, + "sampling/importance_sampling_ratio/mean": 0.7273514866828918, + "sampling/importance_sampling_ratio/min": 0.44319045543670654, + "sampling/sampling_logp_difference/max": 0.669346034526825, + "sampling/sampling_logp_difference/mean": 0.02738836780190468, + "step": 295, + "step_time": 43.65301357599674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 54.875, + "completions/mean_terminated_length": 54.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3765920400619507, + "epoch": 0.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7091373205184937, + "kl": 0.022772938013076782, + "learning_rate": 4.134368598223132e-06, + "loss": -0.2819, + "num_tokens": 1654960.0, + "reward": 0.05500000715255737, + "reward_std": 0.2984171509742737, + "rewards/reward_func/mean": 0.05500000715255737, + "rewards/reward_func/std": 0.3879249095916748, + "sampling/importance_sampling_ratio/max": 2.3806519508361816, + "sampling/importance_sampling_ratio/mean": 1.4367578029632568, + "sampling/importance_sampling_ratio/min": 0.5037611722946167, + "sampling/sampling_logp_difference/max": 0.7325538396835327, + "sampling/sampling_logp_difference/mean": 0.0233943872153759, + "step": 296, + "step_time": 45.81175713399716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 54.625, + "completions/mean_terminated_length": 54.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3774474859237671, + "epoch": 0.594, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8588233590126038, + "kl": 0.018994109705090523, + "learning_rate": 4.128233017926538e-06, + "loss": 0.1595, + "num_tokens": 1660389.0, + "reward": 0.20500001311302185, + "reward_std": 0.512452244758606, + "rewards/reward_func/mean": 0.20500001311302185, + "rewards/reward_func/std": 0.47449222207069397, + "sampling/importance_sampling_ratio/max": 1.57711660861969, + "sampling/importance_sampling_ratio/mean": 0.9654600620269775, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.6728287935256958, + "sampling/sampling_logp_difference/mean": 0.022220637649297714, + "step": 297, + "step_time": 39.472764768011984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 55.75, + "completions/mean_terminated_length": 55.75, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3656435012817383, + "epoch": 0.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9628560543060303, + "kl": 0.019060224294662476, + "learning_rate": 4.1220803582360545e-06, + "loss": 0.0337, + "num_tokens": 1666080.0, + "reward": 0.07625000178813934, + "reward_std": 0.2737163305282593, + "rewards/reward_func/mean": 0.07625000178813934, + "rewards/reward_func/std": 0.35940176248550415, + "sampling/importance_sampling_ratio/max": 1.6562925577163696, + "sampling/importance_sampling_ratio/mean": 0.9328627586364746, + "sampling/importance_sampling_ratio/min": 0.4477497935295105, + "sampling/sampling_logp_difference/max": 0.44923925399780273, + "sampling/sampling_logp_difference/mean": 0.028040671721100807, + "step": 298, + "step_time": 39.901188599003945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 60.625, + "completions/mean_terminated_length": 60.625, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.34412166476249695, + "epoch": 0.598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6345298886299133, + "kl": 0.02608601748943329, + "learning_rate": 4.115910683690167e-06, + "loss": 0.0772, + "num_tokens": 1671266.0, + "reward": 0.3125, + "reward_std": 0.5876922607421875, + "rewards/reward_func/mean": 0.3125, + "rewards/reward_func/std": 0.5627166628837585, + "sampling/importance_sampling_ratio/max": 1.680901050567627, + "sampling/importance_sampling_ratio/mean": 0.8350234031677246, + "sampling/importance_sampling_ratio/min": 0.4430753290653229, + "sampling/sampling_logp_difference/max": 0.4961535930633545, + "sampling/sampling_logp_difference/mean": 0.020545832812786102, + "step": 299, + "step_time": 28.69361345600919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 52.125, + "completions/mean_terminated_length": 52.125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3159938156604767, + "epoch": 0.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.383638620376587, + "kl": 0.02010868862271309, + "learning_rate": 4.109724059005844e-06, + "loss": 0.0037, + "num_tokens": 1676792.0, + "reward": -0.0625, + "reward_std": 0.036226607859134674, + "rewards/reward_func/mean": -0.0625, + "rewards/reward_func/std": 0.04949747398495674, + "sampling/importance_sampling_ratio/max": 1.9275904893875122, + "sampling/importance_sampling_ratio/mean": 1.0863583087921143, + "sampling/importance_sampling_ratio/min": 0.5999199748039246, + "sampling/sampling_logp_difference/max": 0.6911392211914062, + "sampling/sampling_logp_difference/mean": 0.01991921104490757, + "step": 300, + "step_time": 38.05700801000057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 53.75, + "completions/mean_terminated_length": 53.75, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.33787450194358826, + "epoch": 0.602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9712547063827515, + "kl": 0.020664069801568985, + "learning_rate": 4.1035205490778505e-06, + "loss": 0.043, + "num_tokens": 1682614.0, + "reward": 0.20000001788139343, + "reward_std": 0.31392204761505127, + "rewards/reward_func/mean": 0.20000001788139343, + "rewards/reward_func/std": 0.46882835030555725, + "sampling/importance_sampling_ratio/max": 1.7084823846817017, + "sampling/importance_sampling_ratio/mean": 1.151247262954712, + "sampling/importance_sampling_ratio/min": 0.4232744872570038, + "sampling/sampling_logp_difference/max": 0.7267614006996155, + "sampling/sampling_logp_difference/mean": 0.02674541436135769, + "step": 301, + "step_time": 44.15970336600731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 55.375, + "completions/mean_terminated_length": 55.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.35346049070358276, + "epoch": 0.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0315412282943726, + "kl": 0.019526183605194092, + "learning_rate": 4.09730021897807e-06, + "loss": 0.0904, + "num_tokens": 1688667.0, + "reward": 0.45750001072883606, + "reward_std": 0.5982871055603027, + "rewards/reward_func/mean": 0.45750001072883606, + "rewards/reward_func/std": 0.5541467070579529, + "sampling/importance_sampling_ratio/max": 1.6532517671585083, + "sampling/importance_sampling_ratio/mean": 1.1429578065872192, + "sampling/importance_sampling_ratio/min": 0.5493549108505249, + "sampling/sampling_logp_difference/max": 0.7110903263092041, + "sampling/sampling_logp_difference/mean": 0.02418721280992031, + "step": 302, + "step_time": 36.97923886300123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3902936577796936, + "epoch": 0.606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2959387302398682, + "kl": 0.022301558405160904, + "learning_rate": 4.091063133954821e-06, + "loss": 0.1146, + "num_tokens": 1694644.0, + "reward": 0.35249999165534973, + "reward_std": 0.545343279838562, + "rewards/reward_func/mean": 0.35249999165534973, + "rewards/reward_func/std": 0.5202403664588928, + "sampling/importance_sampling_ratio/max": 2.390007257461548, + "sampling/importance_sampling_ratio/mean": 0.9220224618911743, + "sampling/importance_sampling_ratio/min": 0.2947590947151184, + "sampling/sampling_logp_difference/max": 0.6841628551483154, + "sampling/sampling_logp_difference/mean": 0.03195720165967941, + "step": 303, + "step_time": 32.29864318299224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 56.125, + "completions/mean_terminated_length": 56.125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.3676874041557312, + "epoch": 0.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6491054892539978, + "kl": 0.00750330463051796, + "learning_rate": 4.084809359432175e-06, + "loss": -0.1916, + "num_tokens": 1700155.0, + "reward": 0.3462499976158142, + "reward_std": 0.5419092178344727, + "rewards/reward_func/mean": 0.3462499976158142, + "rewards/reward_func/std": 0.5199158787727356, + "sampling/importance_sampling_ratio/max": 1.2273759841918945, + "sampling/importance_sampling_ratio/mean": 0.6842429637908936, + "sampling/importance_sampling_ratio/min": 0.40895017981529236, + "sampling/sampling_logp_difference/max": 0.3567380905151367, + "sampling/sampling_logp_difference/mean": 0.02262764982879162, + "step": 304, + "step_time": 40.56021591799799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.34539082646369934, + "epoch": 0.61, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7469095587730408, + "kl": 0.02391725406050682, + "learning_rate": 4.0785389610092684e-06, + "loss": 0.0006, + "num_tokens": 1705982.0, + "reward": 0.3374999761581421, + "reward_std": 0.5472263097763062, + "rewards/reward_func/mean": 0.3374999761581421, + "rewards/reward_func/std": 0.5298989415168762, + "sampling/importance_sampling_ratio/max": 1.1961729526519775, + "sampling/importance_sampling_ratio/mean": 0.8354759216308594, + "sampling/importance_sampling_ratio/min": 0.4396790862083435, + "sampling/sampling_logp_difference/max": 0.4689488410949707, + "sampling/sampling_logp_difference/mean": 0.023250753059983253, + "step": 305, + "step_time": 38.657245167996734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 56.0, + "completions/mean_terminated_length": 56.0, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.33104047179222107, + "epoch": 0.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3925511837005615, + "kl": 0.03172261267900467, + "learning_rate": 4.072252004459612e-06, + "loss": 0.0256, + "num_tokens": 1711741.0, + "reward": 0.036250002682209015, + "reward_std": 0.2732139229774475, + "rewards/reward_func/mean": 0.036250002682209015, + "rewards/reward_func/std": 0.35991817712783813, + "sampling/importance_sampling_ratio/max": 2.1978962421417236, + "sampling/importance_sampling_ratio/mean": 1.1438350677490234, + "sampling/importance_sampling_ratio/min": 0.12577073276042938, + "sampling/sampling_logp_difference/max": 0.8553478717803955, + "sampling/sampling_logp_difference/mean": 0.027399186044931412, + "step": 306, + "step_time": 42.64118599900394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 57.5, + "completions/mean_terminated_length": 57.5, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.37816122174263, + "epoch": 0.614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8967448472976685, + "kl": 0.01576385274529457, + "learning_rate": 4.065948555730405e-06, + "loss": -0.0581, + "num_tokens": 1717802.0, + "reward": 0.04249999672174454, + "reward_std": 0.29508817195892334, + "rewards/reward_func/mean": 0.04249999672174454, + "rewards/reward_func/std": 0.37174299359321594, + "sampling/importance_sampling_ratio/max": 1.6180919408798218, + "sampling/importance_sampling_ratio/mean": 1.0993964672088623, + "sampling/importance_sampling_ratio/min": 0.5014515519142151, + "sampling/sampling_logp_difference/max": 0.4994962215423584, + "sampling/sampling_logp_difference/mean": 0.024644112214446068, + "step": 307, + "step_time": 43.42916698999761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.2961156666278839, + "epoch": 0.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7969692945480347, + "kl": 0.02517705410718918, + "learning_rate": 4.059628680941843e-06, + "loss": -0.0153, + "num_tokens": 1723439.0, + "reward": 0.32624998688697815, + "reward_std": 0.5344488620758057, + "rewards/reward_func/mean": 0.32624998688697815, + "rewards/reward_func/std": 0.521123468875885, + "sampling/importance_sampling_ratio/max": 1.6039814949035645, + "sampling/importance_sampling_ratio/mean": 0.8779284358024597, + "sampling/importance_sampling_ratio/min": 0.5089890956878662, + "sampling/sampling_logp_difference/max": 0.86330646276474, + "sampling/sampling_logp_difference/mean": 0.023059822618961334, + "step": 308, + "step_time": 39.59198473599099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3278340995311737, + "epoch": 0.618, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0694993734359741, + "kl": 0.040127597749233246, + "learning_rate": 4.053292446386422e-06, + "loss": 0.0202, + "num_tokens": 1728596.0, + "reward": 0.7137500047683716, + "reward_std": 0.32740655541419983, + "rewards/reward_func/mean": 0.7137500047683716, + "rewards/reward_func/std": 0.49684542417526245, + "sampling/importance_sampling_ratio/max": 1.6734663248062134, + "sampling/importance_sampling_ratio/mean": 0.9968644976615906, + "sampling/importance_sampling_ratio/min": 0.4512510895729065, + "sampling/sampling_logp_difference/max": 0.7595298290252686, + "sampling/sampling_logp_difference/mean": 0.026049617677927017, + "step": 309, + "step_time": 34.06555557799584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 51.625, + "completions/mean_terminated_length": 51.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3629554510116577, + "epoch": 0.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4105416536331177, + "kl": 0.025173306465148926, + "learning_rate": 4.046939918528243e-06, + "loss": -0.1381, + "num_tokens": 1734589.0, + "reward": 0.21250000596046448, + "reward_std": 0.519790768623352, + "rewards/reward_func/mean": 0.21250000596046448, + "rewards/reward_func/std": 0.48130035400390625, + "sampling/importance_sampling_ratio/max": 1.744605541229248, + "sampling/importance_sampling_ratio/mean": 1.0676430463790894, + "sampling/importance_sampling_ratio/min": 0.6928181052207947, + "sampling/sampling_logp_difference/max": 0.3814241886138916, + "sampling/sampling_logp_difference/mean": 0.025705184787511826, + "step": 310, + "step_time": 40.15090221299033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.32077813148498535, + "epoch": 0.622, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.09201979637146, + "kl": 0.04986204952001572, + "learning_rate": 4.040571164002319e-06, + "loss": -0.5408, + "num_tokens": 1740635.0, + "reward": 0.1899999976158142, + "reward_std": 0.5336418151855469, + "rewards/reward_func/mean": 0.1899999976158142, + "rewards/reward_func/std": 0.49503248929977417, + "sampling/importance_sampling_ratio/max": 2.502277374267578, + "sampling/importance_sampling_ratio/mean": 1.2795238494873047, + "sampling/importance_sampling_ratio/min": 0.3709660768508911, + "sampling/sampling_logp_difference/max": 1.0115962028503418, + "sampling/sampling_logp_difference/mean": 0.027241632342338562, + "step": 311, + "step_time": 42.1582357169973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 51.375, + "completions/mean_terminated_length": 51.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3752894401550293, + "epoch": 0.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7901284694671631, + "kl": 0.04521573334932327, + "learning_rate": 4.034186249613869e-06, + "loss": 0.1113, + "num_tokens": 1746240.0, + "reward": 0.07625000923871994, + "reward_std": 0.2738334536552429, + "rewards/reward_func/mean": 0.07625000923871994, + "rewards/reward_func/std": 0.36629176139831543, + "sampling/importance_sampling_ratio/max": 1.678858757019043, + "sampling/importance_sampling_ratio/mean": 0.8888430595397949, + "sampling/importance_sampling_ratio/min": 0.30794695019721985, + "sampling/sampling_logp_difference/max": 0.8799378871917725, + "sampling/sampling_logp_difference/mean": 0.026124432682991028, + "step": 312, + "step_time": 39.03588431800017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.33964860439300537, + "epoch": 0.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9753314852714539, + "kl": 0.018361952155828476, + "learning_rate": 4.027785242337626e-06, + "loss": -0.0936, + "num_tokens": 1751664.0, + "reward": 0.3450000286102295, + "reward_std": 0.5599009394645691, + "rewards/reward_func/mean": 0.3450000286102295, + "rewards/reward_func/std": 0.5376669764518738, + "sampling/importance_sampling_ratio/max": 1.7452173233032227, + "sampling/importance_sampling_ratio/mean": 0.9294567704200745, + "sampling/importance_sampling_ratio/min": 0.40640538930892944, + "sampling/sampling_logp_difference/max": 0.7474876046180725, + "sampling/sampling_logp_difference/mean": 0.02840811386704445, + "step": 313, + "step_time": 35.24361865199171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 53.875, + "completions/mean_terminated_length": 53.875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.31101056933403015, + "epoch": 0.628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8589064478874207, + "kl": 0.028528966009616852, + "learning_rate": 4.021368209317126e-06, + "loss": -0.0071, + "num_tokens": 1756602.0, + "reward": 0.33250001072883606, + "reward_std": 0.5354459285736084, + "rewards/reward_func/mean": 0.33250001072883606, + "rewards/reward_func/std": 0.5180389285087585, + "sampling/importance_sampling_ratio/max": 1.1891282796859741, + "sampling/importance_sampling_ratio/mean": 0.6778733730316162, + "sampling/importance_sampling_ratio/min": 0.2918332517147064, + "sampling/sampling_logp_difference/max": 1.3020625114440918, + "sampling/sampling_logp_difference/mean": 0.02469920739531517, + "step": 314, + "step_time": 29.316569301998243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 55.25, + "completions/mean_terminated_length": 55.25, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3299601674079895, + "epoch": 0.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8943186402320862, + "kl": 0.02374444343149662, + "learning_rate": 4.014935217864009e-06, + "loss": 0.125, + "num_tokens": 1762205.0, + "reward": 0.2212499976158142, + "reward_std": 0.5198818445205688, + "rewards/reward_func/mean": 0.2212499976158142, + "rewards/reward_func/std": 0.4814394414424896, + "sampling/importance_sampling_ratio/max": 1.4114490747451782, + "sampling/importance_sampling_ratio/mean": 0.8938862085342407, + "sampling/importance_sampling_ratio/min": 0.5008694529533386, + "sampling/sampling_logp_difference/max": 0.34326255321502686, + "sampling/sampling_logp_difference/mean": 0.020477421581745148, + "step": 315, + "step_time": 41.47385054100596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 57.625, + "completions/mean_terminated_length": 57.625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3791668713092804, + "epoch": 0.632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6989040374755859, + "kl": 0.031720541417598724, + "learning_rate": 4.008486335457312e-06, + "loss": -0.2414, + "num_tokens": 1767759.0, + "reward": 0.07500000298023224, + "reward_std": 0.30030137300491333, + "rewards/reward_func/mean": 0.07500000298023224, + "rewards/reward_func/std": 0.37826672196388245, + "sampling/importance_sampling_ratio/max": 2.765488862991333, + "sampling/importance_sampling_ratio/mean": 1.0680071115493774, + "sampling/importance_sampling_ratio/min": 0.15137039124965668, + "sampling/sampling_logp_difference/max": 0.9639625549316406, + "sampling/sampling_logp_difference/mean": 0.028898822143673897, + "step": 316, + "step_time": 46.809593726997264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3248460590839386, + "epoch": 0.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9084358215332031, + "kl": 0.021403685212135315, + "learning_rate": 4.002021629742759e-06, + "loss": -0.0513, + "num_tokens": 1773686.0, + "reward": 0.21000000834465027, + "reward_std": 0.32260364294052124, + "rewards/reward_func/mean": 0.21000000834465027, + "rewards/reward_func/std": 0.4816637933254242, + "sampling/importance_sampling_ratio/max": 1.897550106048584, + "sampling/importance_sampling_ratio/mean": 1.1017444133758545, + "sampling/importance_sampling_ratio/min": 0.3599690794944763, + "sampling/sampling_logp_difference/max": 0.6600342988967896, + "sampling/sampling_logp_difference/mean": 0.019616486504673958, + "step": 317, + "step_time": 44.18835420500545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 55.25, + "completions/mean_terminated_length": 55.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.337682843208313, + "epoch": 0.636, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7857117652893066, + "kl": 0.021645016968250275, + "learning_rate": 3.995541168532055e-06, + "loss": -0.3915, + "num_tokens": 1779077.0, + "reward": 0.1925000101327896, + "reward_std": 0.3348638415336609, + "rewards/reward_func/mean": 0.1925000101327896, + "rewards/reward_func/std": 0.4793075621128082, + "sampling/importance_sampling_ratio/max": 1.4211463928222656, + "sampling/importance_sampling_ratio/mean": 0.8854165077209473, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.6028759479522705, + "sampling/sampling_logp_difference/mean": 0.024257110431790352, + "step": 318, + "step_time": 39.01218066600268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 60.5, + "completions/mean_terminated_length": 60.5, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.38314715027809143, + "epoch": 0.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7992427945137024, + "kl": 0.02151847444474697, + "learning_rate": 3.989045019802171e-06, + "loss": 0.108, + "num_tokens": 1785372.0, + "reward": 0.07500000298023224, + "reward_std": 0.2610323429107666, + "rewards/reward_func/mean": 0.07500000298023224, + "rewards/reward_func/std": 0.3674623668193817, + "sampling/importance_sampling_ratio/max": 1.8030726909637451, + "sampling/importance_sampling_ratio/mean": 1.1737439632415771, + "sampling/importance_sampling_ratio/min": 0.739587664604187, + "sampling/sampling_logp_difference/max": 0.7126893997192383, + "sampling/sampling_logp_difference/mean": 0.02029246836900711, + "step": 319, + "step_time": 43.12463011700311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.332579642534256, + "epoch": 0.64, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3053195476531982, + "kl": 0.724174439907074, + "learning_rate": 3.982533251694632e-06, + "loss": -0.22, + "num_tokens": 1791652.0, + "reward": 0.07124999910593033, + "reward_std": 0.2992333769798279, + "rewards/reward_func/mean": 0.07124999910593033, + "rewards/reward_func/std": 0.3789623975753784, + "sampling/importance_sampling_ratio/max": 1.868449330329895, + "sampling/importance_sampling_ratio/mean": 0.8420298099517822, + "sampling/importance_sampling_ratio/min": 0.3771025538444519, + "sampling/sampling_logp_difference/max": 1.4212937355041504, + "sampling/sampling_logp_difference/mean": 0.026473678648471832, + "step": 320, + "step_time": 53.79963376899832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.31947869062423706, + "epoch": 0.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9473581314086914, + "kl": 0.01975640282034874, + "learning_rate": 3.976005932514807e-06, + "loss": -0.0296, + "num_tokens": 1796652.0, + "reward": -0.051249995827674866, + "reward_std": 0.0348023921251297, + "rewards/reward_func/mean": -0.051249995827674866, + "rewards/reward_func/std": 0.04642582684755325, + "sampling/importance_sampling_ratio/max": 1.4474753141403198, + "sampling/importance_sampling_ratio/mean": 0.9341865181922913, + "sampling/importance_sampling_ratio/min": 0.4438150227069855, + "sampling/sampling_logp_difference/max": 0.5270947217941284, + "sampling/sampling_logp_difference/mean": 0.026481110602617264, + "step": 321, + "step_time": 31.841147099010414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3830873966217041, + "epoch": 0.644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9753480553627014, + "kl": 0.01734742894768715, + "learning_rate": 3.969463130731183e-06, + "loss": 0.0279, + "num_tokens": 1802874.0, + "reward": 0.1899999976158142, + "reward_std": 0.5408111810684204, + "rewards/reward_func/mean": 0.1899999976158142, + "rewards/reward_func/std": 0.5016544461250305, + "sampling/importance_sampling_ratio/max": 1.7382115125656128, + "sampling/importance_sampling_ratio/mean": 0.9483833909034729, + "sampling/importance_sampling_ratio/min": 0.5650532245635986, + "sampling/sampling_logp_difference/max": 0.4500439167022705, + "sampling/sampling_logp_difference/mean": 0.024530794471502304, + "step": 322, + "step_time": 40.43267983599799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 50.875, + "completions/mean_terminated_length": 50.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3294594883918762, + "epoch": 0.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8316826820373535, + "kl": 0.01684340089559555, + "learning_rate": 3.962904914974656e-06, + "loss": -0.0565, + "num_tokens": 1808418.0, + "reward": 0.19499999284744263, + "reward_std": 0.33696448802948, + "rewards/reward_func/mean": 0.19499999284744263, + "rewards/reward_func/std": 0.4996570646762848, + "sampling/importance_sampling_ratio/max": 1.5402275323867798, + "sampling/importance_sampling_ratio/mean": 0.6515508890151978, + "sampling/importance_sampling_ratio/min": 0.23761314153671265, + "sampling/sampling_logp_difference/max": 0.7509863376617432, + "sampling/sampling_logp_difference/mean": 0.02614228054881096, + "step": 323, + "step_time": 36.79304045200115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 53.5, + "completions/mean_terminated_length": 53.5, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.33004504442214966, + "epoch": 0.648, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1432764530181885, + "kl": 0.02042960189282894, + "learning_rate": 3.956331354037805e-06, + "loss": -0.1958, + "num_tokens": 1813504.0, + "reward": 0.20375001430511475, + "reward_std": 0.32482287287712097, + "rewards/reward_func/mean": 0.20375001430511475, + "rewards/reward_func/std": 0.4935567080974579, + "sampling/importance_sampling_ratio/max": 2.893115758895874, + "sampling/importance_sampling_ratio/mean": 1.2178199291229248, + "sampling/importance_sampling_ratio/min": 0.6653203368186951, + "sampling/sampling_logp_difference/max": 0.5842078924179077, + "sampling/sampling_logp_difference/mean": 0.021406445652246475, + "step": 324, + "step_time": 28.069678256011684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.2968294024467468, + "epoch": 0.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0796535015106201, + "kl": 0.015129530802369118, + "learning_rate": 3.949742516874175e-06, + "loss": 0.0111, + "num_tokens": 1819379.0, + "reward": 0.07000000029802322, + "reward_std": 0.260448157787323, + "rewards/reward_func/mean": 0.07000000029802322, + "rewards/reward_func/std": 0.3445908725261688, + "sampling/importance_sampling_ratio/max": 1.8804163932800293, + "sampling/importance_sampling_ratio/mean": 1.0742685794830322, + "sampling/importance_sampling_ratio/min": 0.46257898211479187, + "sampling/sampling_logp_difference/max": 0.3363761901855469, + "sampling/sampling_logp_difference/mean": 0.019979190081357956, + "step": 325, + "step_time": 41.76114026000141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 57.875, + "completions/mean_terminated_length": 57.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3415643870830536, + "epoch": 0.652, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1616402864456177, + "kl": 0.011725490912795067, + "learning_rate": 3.943138472597549e-06, + "loss": 0.0107, + "num_tokens": 1824630.0, + "reward": 0.5975000262260437, + "reward_std": 0.5574594736099243, + "rewards/reward_func/mean": 0.5975000262260437, + "rewards/reward_func/std": 0.5363035202026367, + "sampling/importance_sampling_ratio/max": 1.4799103736877441, + "sampling/importance_sampling_ratio/mean": 1.0802059173583984, + "sampling/importance_sampling_ratio/min": 0.49768996238708496, + "sampling/sampling_logp_difference/max": 0.31270480155944824, + "sampling/sampling_logp_difference/mean": 0.022173818200826645, + "step": 326, + "step_time": 22.385102098996867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 54.125, + "completions/mean_terminated_length": 54.125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.32594597339630127, + "epoch": 0.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9298455119132996, + "kl": 0.013603993691504002, + "learning_rate": 3.936519290481226e-06, + "loss": -0.1398, + "num_tokens": 1830458.0, + "reward": -0.04500000178813934, + "reward_std": 0.03829461336135864, + "rewards/reward_func/mean": -0.04500000178813934, + "rewards/reward_func/std": 0.037032805383205414, + "sampling/importance_sampling_ratio/max": 1.7700729370117188, + "sampling/importance_sampling_ratio/mean": 0.8740437030792236, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.3294541835784912, + "sampling/sampling_logp_difference/mean": 0.022188276052474976, + "step": 327, + "step_time": 40.74412981000205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 60.5, + "completions/mean_terminated_length": 60.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.344268262386322, + "epoch": 0.656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0750876665115356, + "kl": 0.028728434816002846, + "learning_rate": 3.929885039957296e-06, + "loss": 0.1486, + "num_tokens": 1835664.0, + "reward": 0.19750000536441803, + "reward_std": 0.5264096856117249, + "rewards/reward_func/mean": 0.19750000536441803, + "rewards/reward_func/std": 0.4876694083213806, + "sampling/importance_sampling_ratio/max": 2.0909793376922607, + "sampling/importance_sampling_ratio/mean": 0.974539577960968, + "sampling/importance_sampling_ratio/min": 0.5380573868751526, + "sampling/sampling_logp_difference/max": 0.4193446636199951, + "sampling/sampling_logp_difference/mean": 0.023767180740833282, + "step": 328, + "step_time": 44.32392562199675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.2737228572368622, + "epoch": 0.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8460615873336792, + "kl": 0.04131521284580231, + "learning_rate": 3.923235790615907e-06, + "loss": -0.2082, + "num_tokens": 1841050.0, + "reward": 0.06874999403953552, + "reward_std": 0.28597307205200195, + "rewards/reward_func/mean": 0.06874999403953552, + "rewards/reward_func/std": 0.37911316752433777, + "sampling/importance_sampling_ratio/max": 1.7139233350753784, + "sampling/importance_sampling_ratio/mean": 0.9048976898193359, + "sampling/importance_sampling_ratio/min": 0.1554194986820221, + "sampling/sampling_logp_difference/max": 0.7940307855606079, + "sampling/sampling_logp_difference/mean": 0.022876432165503502, + "step": 329, + "step_time": 32.28045204099908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.30464908480644226, + "epoch": 0.66, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9566115736961365, + "kl": 0.021810725331306458, + "learning_rate": 3.916571612204538e-06, + "loss": -0.1048, + "num_tokens": 1846399.0, + "reward": 0.20374999940395355, + "reward_std": 0.5155032873153687, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.477491557598114, + "sampling/importance_sampling_ratio/max": 1.5081989765167236, + "sampling/importance_sampling_ratio/mean": 0.9097875952720642, + "sampling/importance_sampling_ratio/min": 0.47412431240081787, + "sampling/sampling_logp_difference/max": 0.46178531646728516, + "sampling/sampling_logp_difference/mean": 0.02363799698650837, + "step": 330, + "step_time": 32.77630696099368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.29317551851272583, + "epoch": 0.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9641599655151367, + "kl": 0.019616033881902695, + "learning_rate": 3.909892574627267e-06, + "loss": -0.1627, + "num_tokens": 1852264.0, + "reward": 0.1824999898672104, + "reward_std": 0.32395851612091064, + "rewards/reward_func/mean": 0.1824999898672104, + "rewards/reward_func/std": 0.4794565439224243, + "sampling/importance_sampling_ratio/max": 2.4280335903167725, + "sampling/importance_sampling_ratio/mean": 1.0602843761444092, + "sampling/importance_sampling_ratio/min": 0.48485422134399414, + "sampling/sampling_logp_difference/max": 0.336214542388916, + "sampling/sampling_logp_difference/mean": 0.025283973664045334, + "step": 331, + "step_time": 40.58578193899302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.35180217027664185, + "epoch": 0.664, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7843549251556396, + "kl": 0.036747075617313385, + "learning_rate": 3.903198747944037e-06, + "loss": -0.0709, + "num_tokens": 1858008.0, + "reward": 0.08874999731779099, + "reward_std": 0.27637845277786255, + "rewards/reward_func/mean": 0.08874999731779099, + "rewards/reward_func/std": 0.36910849809646606, + "sampling/importance_sampling_ratio/max": 2.3475561141967773, + "sampling/importance_sampling_ratio/mean": 1.002429485321045, + "sampling/importance_sampling_ratio/min": 0.3496679663658142, + "sampling/sampling_logp_difference/max": 0.7366769313812256, + "sampling/sampling_logp_difference/mean": 0.029994945973157883, + "step": 332, + "step_time": 40.87763853299839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3407575488090515, + "epoch": 0.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.428694486618042, + "kl": 0.016911303624510765, + "learning_rate": 3.896490202369924e-06, + "loss": 0.0366, + "num_tokens": 1863181.0, + "reward": 0.48625001311302185, + "reward_std": 0.590417206287384, + "rewards/reward_func/mean": 0.48625001311302185, + "rewards/reward_func/std": 0.5468333959579468, + "sampling/importance_sampling_ratio/max": 1.7516218423843384, + "sampling/importance_sampling_ratio/mean": 1.0637693405151367, + "sampling/importance_sampling_ratio/min": 0.2746276259422302, + "sampling/sampling_logp_difference/max": 1.1909523010253906, + "sampling/sampling_logp_difference/mean": 0.026827020570635796, + "step": 333, + "step_time": 21.48538727000414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3031199276447296, + "epoch": 0.668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.957080066204071, + "kl": 0.014239022508263588, + "learning_rate": 3.889767008274396e-06, + "loss": 0.02, + "num_tokens": 1868717.0, + "reward": 0.21875, + "reward_std": 0.29802343249320984, + "rewards/reward_func/mean": 0.21875, + "rewards/reward_func/std": 0.467743456363678, + "sampling/importance_sampling_ratio/max": 1.57808256149292, + "sampling/importance_sampling_ratio/mean": 0.9328022599220276, + "sampling/importance_sampling_ratio/min": 0.6014644503593445, + "sampling/sampling_logp_difference/max": 0.4710826873779297, + "sampling/sampling_logp_difference/mean": 0.019350770860910416, + "step": 334, + "step_time": 38.31029029999627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 51.25, + "completions/mean_terminated_length": 51.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3382827043533325, + "epoch": 0.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2987167835235596, + "kl": 0.030748853459954262, + "learning_rate": 3.883029236180577e-06, + "loss": 0.4063, + "num_tokens": 1874967.0, + "reward": 0.0925000011920929, + "reward_std": 0.26742854714393616, + "rewards/reward_func/mean": 0.0925000011920929, + "rewards/reward_func/std": 0.3517608642578125, + "sampling/importance_sampling_ratio/max": 2.299546957015991, + "sampling/importance_sampling_ratio/mean": 1.0671589374542236, + "sampling/importance_sampling_ratio/min": 0.40785279870033264, + "sampling/sampling_logp_difference/max": 0.44230735301971436, + "sampling/sampling_logp_difference/mean": 0.023608166724443436, + "step": 335, + "step_time": 48.46738143800758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.31930792331695557, + "epoch": 0.672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3972865343093872, + "kl": 0.025625426322221756, + "learning_rate": 3.876276956764509e-06, + "loss": 0.0398, + "num_tokens": 1880164.0, + "reward": 0.19624999165534973, + "reward_std": 0.311894953250885, + "rewards/reward_func/mean": 0.19624999165534973, + "rewards/reward_func/std": 0.47234785556793213, + "sampling/importance_sampling_ratio/max": 1.2718799114227295, + "sampling/importance_sampling_ratio/mean": 0.8168869018554688, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.6838905811309814, + "sampling/sampling_logp_difference/mean": 0.02987261861562729, + "step": 336, + "step_time": 31.000979021002422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 55.875, + "completions/mean_terminated_length": 55.875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.3230094015598297, + "epoch": 0.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0229469537734985, + "kl": 0.015833809971809387, + "learning_rate": 3.869510240854408e-06, + "loss": 0.3128, + "num_tokens": 1885758.0, + "reward": 0.057499997317790985, + "reward_std": 0.27753278613090515, + "rewards/reward_func/mean": 0.057499997317790985, + "rewards/reward_func/std": 0.3586781322956085, + "sampling/importance_sampling_ratio/max": 2.7663161754608154, + "sampling/importance_sampling_ratio/mean": 1.203728199005127, + "sampling/importance_sampling_ratio/min": 0.38395655155181885, + "sampling/sampling_logp_difference/max": 0.44419431686401367, + "sampling/sampling_logp_difference/mean": 0.02450854331254959, + "step": 337, + "step_time": 40.59597251701052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.34267550706863403, + "epoch": 0.676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7642379999160767, + "kl": 0.03843656927347183, + "learning_rate": 3.862729159429921e-06, + "loss": -0.2044, + "num_tokens": 1891187.0, + "reward": 0.7300000190734863, + "reward_std": 0.5168682336807251, + "rewards/reward_func/mean": 0.7300000190734863, + "rewards/reward_func/std": 0.47883790731430054, + "sampling/importance_sampling_ratio/max": 1.2588783502578735, + "sampling/importance_sampling_ratio/mean": 0.7454515695571899, + "sampling/importance_sampling_ratio/min": 0.35877272486686707, + "sampling/sampling_logp_difference/max": 0.8279721736907959, + "sampling/sampling_logp_difference/mean": 0.027465185150504112, + "step": 338, + "step_time": 28.354054954994353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3062908947467804, + "epoch": 0.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8934028148651123, + "kl": 0.017625048756599426, + "learning_rate": 3.855933783621384e-06, + "loss": -0.0268, + "num_tokens": 1896594.0, + "reward": 0.03374999761581421, + "reward_std": 0.2926676571369171, + "rewards/reward_func/mean": 0.03374999761581421, + "rewards/reward_func/std": 0.38116130232810974, + "sampling/importance_sampling_ratio/max": 2.695901870727539, + "sampling/importance_sampling_ratio/mean": 1.0782511234283447, + "sampling/importance_sampling_ratio/min": 0.34986376762390137, + "sampling/sampling_logp_difference/max": 1.0531506538391113, + "sampling/sampling_logp_difference/mean": 0.02146495133638382, + "step": 339, + "step_time": 38.300622745009605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 55.0, + "completions/mean_terminated_length": 55.0, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.30942976474761963, + "epoch": 0.68, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.121816873550415, + "kl": 0.02171071618795395, + "learning_rate": 3.849124184709073e-06, + "loss": -0.0232, + "num_tokens": 1901989.0, + "reward": 0.08500000089406967, + "reward_std": 0.28684449195861816, + "rewards/reward_func/mean": 0.08500000089406967, + "rewards/reward_func/std": 0.37171417474746704, + "sampling/importance_sampling_ratio/max": 1.164718508720398, + "sampling/importance_sampling_ratio/mean": 0.9217618703842163, + "sampling/importance_sampling_ratio/min": 0.47626274824142456, + "sampling/sampling_logp_difference/max": 0.9351463317871094, + "sampling/sampling_logp_difference/mean": 0.021724089980125427, + "step": 340, + "step_time": 41.05157758499263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3553770184516907, + "epoch": 0.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8589447140693665, + "kl": 0.023396966978907585, + "learning_rate": 3.84230043412246e-06, + "loss": 0.1354, + "num_tokens": 1907544.0, + "reward": 0.16500000655651093, + "reward_std": 0.5003730058670044, + "rewards/reward_func/mean": 0.16500000655651093, + "rewards/reward_func/std": 0.46800491213798523, + "sampling/importance_sampling_ratio/max": 1.3289875984191895, + "sampling/importance_sampling_ratio/mean": 0.7921176552772522, + "sampling/importance_sampling_ratio/min": 0.35033443570137024, + "sampling/sampling_logp_difference/max": 0.6369071006774902, + "sampling/sampling_logp_difference/mean": 0.03014998510479927, + "step": 341, + "step_time": 35.57554464499117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3291049003601074, + "epoch": 0.684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9054797887802124, + "kl": 0.02581053599715233, + "learning_rate": 3.835462603439458e-06, + "loss": 0.0187, + "num_tokens": 1912578.0, + "reward": 0.3537500202655792, + "reward_std": 0.5527259111404419, + "rewards/reward_func/mean": 0.3537500202655792, + "rewards/reward_func/std": 0.5297961831092834, + "sampling/importance_sampling_ratio/max": 1.383819818496704, + "sampling/importance_sampling_ratio/mean": 0.9456525444984436, + "sampling/importance_sampling_ratio/min": 0.24566781520843506, + "sampling/sampling_logp_difference/max": 1.197718858718872, + "sampling/sampling_logp_difference/mean": 0.025123560801148415, + "step": 342, + "step_time": 33.69786287700117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3272516131401062, + "epoch": 0.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9231871366500854, + "kl": 0.0600452721118927, + "learning_rate": 3.828610764385676e-06, + "loss": 0.1178, + "num_tokens": 1918689.0, + "reward": 0.11375001072883606, + "reward_std": 0.24988916516304016, + "rewards/reward_func/mean": 0.11375001072883606, + "rewards/reward_func/std": 0.34616008400917053, + "sampling/importance_sampling_ratio/max": 1.4086482524871826, + "sampling/importance_sampling_ratio/mean": 0.9599908590316772, + "sampling/importance_sampling_ratio/min": 0.3553762137889862, + "sampling/sampling_logp_difference/max": 0.7473673820495605, + "sampling/sampling_logp_difference/mean": 0.02560514770448208, + "step": 343, + "step_time": 38.41646344099718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3004932999610901, + "epoch": 0.688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7889523506164551, + "kl": 0.015588251873850822, + "learning_rate": 3.821744988833664e-06, + "loss": 0.0061, + "num_tokens": 1924332.0, + "reward": 0.3400000035762787, + "reward_std": 0.2622142732143402, + "rewards/reward_func/mean": 0.3400000035762787, + "rewards/reward_func/std": 0.5224120020866394, + "sampling/importance_sampling_ratio/max": 0.943295955657959, + "sampling/importance_sampling_ratio/mean": 0.6444682478904724, + "sampling/importance_sampling_ratio/min": 0.2145964801311493, + "sampling/sampling_logp_difference/max": 0.5309677124023438, + "sampling/sampling_logp_difference/mean": 0.0254978034645319, + "step": 344, + "step_time": 31.367738123008166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 51.0, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3225318193435669, + "epoch": 0.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6011006236076355, + "kl": 0.022544488310813904, + "learning_rate": 3.814865348802157e-06, + "loss": 0.1901, + "num_tokens": 1929199.0, + "reward": 0.22499999403953552, + "reward_std": 0.5106456279754639, + "rewards/reward_func/mean": 0.22499999403953552, + "rewards/reward_func/std": 0.4728938341140747, + "sampling/importance_sampling_ratio/max": 1.5084651708602905, + "sampling/importance_sampling_ratio/mean": 0.8030335903167725, + "sampling/importance_sampling_ratio/min": 0.23240113258361816, + "sampling/sampling_logp_difference/max": 0.6574427485466003, + "sampling/sampling_logp_difference/mean": 0.023013845086097717, + "step": 345, + "step_time": 30.061961209998117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 56.375, + "completions/mean_terminated_length": 56.375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.31706368923187256, + "epoch": 0.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0651155710220337, + "kl": 0.018591083586215973, + "learning_rate": 3.807971916455325e-06, + "loss": 0.2587, + "num_tokens": 1934113.0, + "reward": 0.33500000834465027, + "reward_std": 0.5684312582015991, + "rewards/reward_func/mean": 0.33500000834465027, + "rewards/reward_func/std": 0.5465737581253052, + "sampling/importance_sampling_ratio/max": 2.3241448402404785, + "sampling/importance_sampling_ratio/mean": 1.1834617853164673, + "sampling/importance_sampling_ratio/min": 0.557026743888855, + "sampling/sampling_logp_difference/max": 0.49699926376342773, + "sampling/sampling_logp_difference/mean": 0.019841229543089867, + "step": 346, + "step_time": 29.66081979201408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 60.75, + "completions/mean_terminated_length": 60.75, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.3459808826446533, + "epoch": 0.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9309832453727722, + "kl": 0.017617210745811462, + "learning_rate": 3.8010647641020116e-06, + "loss": -0.013, + "num_tokens": 1939744.0, + "reward": 0.4462500214576721, + "reward_std": 0.5236546397209167, + "rewards/reward_func/mean": 0.4462500214576721, + "rewards/reward_func/std": 0.568580687046051, + "sampling/importance_sampling_ratio/max": 1.3764897584915161, + "sampling/importance_sampling_ratio/mean": 1.0339674949645996, + "sampling/importance_sampling_ratio/min": 0.5544477105140686, + "sampling/sampling_logp_difference/max": 0.5542728900909424, + "sampling/sampling_logp_difference/mean": 0.020336320623755455, + "step": 347, + "step_time": 39.97378583300451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.30096814036369324, + "epoch": 0.696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1978667974472046, + "kl": 0.032052673399448395, + "learning_rate": 3.794143964194976e-06, + "loss": 0.2779, + "num_tokens": 1945009.0, + "reward": 0.20250000059604645, + "reward_std": 0.516106367111206, + "rewards/reward_func/mean": 0.20250000059604645, + "rewards/reward_func/std": 0.47805407643318176, + "sampling/importance_sampling_ratio/max": 1.9551745653152466, + "sampling/importance_sampling_ratio/mean": 1.1093769073486328, + "sampling/importance_sampling_ratio/min": 0.2619774639606476, + "sampling/sampling_logp_difference/max": 0.6207488775253296, + "sampling/sampling_logp_difference/mean": 0.025560472160577774, + "step": 348, + "step_time": 35.64131051799632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 58.875, + "completions/mean_terminated_length": 58.875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.3116108775138855, + "epoch": 0.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8886599540710449, + "kl": 0.01983170211315155, + "learning_rate": 3.7872095893301344e-06, + "loss": -0.1341, + "num_tokens": 1950903.0, + "reward": 0.49125000834465027, + "reward_std": 0.4993107318878174, + "rewards/reward_func/mean": 0.49125000834465027, + "rewards/reward_func/std": 0.5333302021026611, + "sampling/importance_sampling_ratio/max": 1.8198500871658325, + "sampling/importance_sampling_ratio/mean": 0.9573653340339661, + "sampling/importance_sampling_ratio/min": 0.5384606122970581, + "sampling/sampling_logp_difference/max": 0.4561774730682373, + "sampling/sampling_logp_difference/mean": 0.01902041584253311, + "step": 349, + "step_time": 37.28226282200194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 54.25, + "completions/mean_terminated_length": 54.25, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.35526877641677856, + "epoch": 0.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0215078592300415, + "kl": 0.01806233450770378, + "learning_rate": 3.7802617122457976e-06, + "loss": 0.0752, + "num_tokens": 1956854.0, + "reward": 0.21375000476837158, + "reward_std": 0.5120877027511597, + "rewards/reward_func/mean": 0.21375000476837158, + "rewards/reward_func/std": 0.4747913181781769, + "sampling/importance_sampling_ratio/max": 1.565601110458374, + "sampling/importance_sampling_ratio/mean": 1.0413784980773926, + "sampling/importance_sampling_ratio/min": 0.45870745182037354, + "sampling/sampling_logp_difference/max": 0.5673609972000122, + "sampling/sampling_logp_difference/mean": 0.02417255938053131, + "step": 350, + "step_time": 40.4494745760021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.380689412355423, + "epoch": 0.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9029824733734131, + "kl": 0.018018556758761406, + "learning_rate": 3.773300405821908e-06, + "loss": -0.0726, + "num_tokens": 1962725.0, + "reward": 0.3137499988079071, + "reward_std": 0.583433210849762, + "rewards/reward_func/mean": 0.3137499988079071, + "rewards/reward_func/std": 0.5663400888442993, + "sampling/importance_sampling_ratio/max": 1.4209295511245728, + "sampling/importance_sampling_ratio/mean": 0.8946368098258972, + "sampling/importance_sampling_ratio/min": 0.4468167722225189, + "sampling/sampling_logp_difference/max": 0.4564931392669678, + "sampling/sampling_logp_difference/mean": 0.02643968164920807, + "step": 351, + "step_time": 44.57280629100569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 62.0, + "completions/mean_terminated_length": 62.0, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.31801438331604004, + "epoch": 0.704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9474357962608337, + "kl": 0.013338024728000164, + "learning_rate": 3.766325743079277e-06, + "loss": 0.1357, + "num_tokens": 1967643.0, + "reward": 0.20499999821186066, + "reward_std": 0.32446253299713135, + "rewards/reward_func/mean": 0.20499999821186066, + "rewards/reward_func/std": 0.48314449191093445, + "sampling/importance_sampling_ratio/max": 1.5313472747802734, + "sampling/importance_sampling_ratio/mean": 1.1058592796325684, + "sampling/importance_sampling_ratio/min": 0.44263550639152527, + "sampling/sampling_logp_difference/max": 0.45990777015686035, + "sampling/sampling_logp_difference/mean": 0.020315904170274734, + "step": 352, + "step_time": 32.50577549599984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 56.625, + "completions/mean_terminated_length": 56.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.36904579401016235, + "epoch": 0.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6299339532852173, + "kl": 0.05941528081893921, + "learning_rate": 3.7593377971788162e-06, + "loss": -0.0727, + "num_tokens": 1972573.0, + "reward": 0.3474999964237213, + "reward_std": 0.5575968027114868, + "rewards/reward_func/mean": 0.3474999964237213, + "rewards/reward_func/std": 0.5384302735328674, + "sampling/importance_sampling_ratio/max": 2.091653823852539, + "sampling/importance_sampling_ratio/mean": 1.1020057201385498, + "sampling/importance_sampling_ratio/min": 0.5405357480049133, + "sampling/sampling_logp_difference/max": 0.8576881885528564, + "sampling/sampling_logp_difference/mean": 0.03374676778912544, + "step": 353, + "step_time": 30.564295716001652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 53.5, + "completions/mean_terminated_length": 53.5, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3194846212863922, + "epoch": 0.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9248061776161194, + "kl": 0.012258417904376984, + "learning_rate": 3.752336641420772e-06, + "loss": -0.0662, + "num_tokens": 1977516.0, + "reward": 0.4700000286102295, + "reward_std": 0.5688798427581787, + "rewards/reward_func/mean": 0.4700000286102295, + "rewards/reward_func/std": 0.5268504619598389, + "sampling/importance_sampling_ratio/max": 1.3270989656448364, + "sampling/importance_sampling_ratio/mean": 1.0121691226959229, + "sampling/importance_sampling_ratio/min": 0.6218995451927185, + "sampling/sampling_logp_difference/max": 0.2814149856567383, + "sampling/sampling_logp_difference/mean": 0.0207376666367054, + "step": 354, + "step_time": 32.0068911069975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 57.0, + "completions/mean_terminated_length": 57.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.38322630524635315, + "epoch": 0.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0300030708312988, + "kl": 0.030064791440963745, + "learning_rate": 3.7453223492439544e-06, + "loss": -0.1406, + "num_tokens": 1983794.0, + "reward": 0.19500000774860382, + "reward_std": 0.5388761758804321, + "rewards/reward_func/mean": 0.19500000774860382, + "rewards/reward_func/std": 0.4991707503795624, + "sampling/importance_sampling_ratio/max": 1.4179537296295166, + "sampling/importance_sampling_ratio/mean": 0.8694342374801636, + "sampling/importance_sampling_ratio/min": 0.34229519963264465, + "sampling/sampling_logp_difference/max": 0.7407898902893066, + "sampling/sampling_logp_difference/mean": 0.028531817719340324, + "step": 355, + "step_time": 37.6383990119939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.30376651883125305, + "epoch": 0.712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2527189254760742, + "kl": 0.0377352349460125, + "learning_rate": 3.7382949942249695e-06, + "loss": 0.2867, + "num_tokens": 1989118.0, + "reward": 0.1837500035762787, + "reward_std": 0.5111405253410339, + "rewards/reward_func/mean": 0.1837500035762787, + "rewards/reward_func/std": 0.4737672209739685, + "sampling/importance_sampling_ratio/max": 2.962144136428833, + "sampling/importance_sampling_ratio/mean": 1.4776464700698853, + "sampling/importance_sampling_ratio/min": 0.5036495923995972, + "sampling/sampling_logp_difference/max": 0.7229299545288086, + "sampling/sampling_logp_difference/mean": 0.01985524222254753, + "step": 356, + "step_time": 36.960345953993965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 56.625, + "completions/mean_terminated_length": 56.625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.31242305040359497, + "epoch": 0.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8390851020812988, + "kl": 0.11288845539093018, + "learning_rate": 3.731254650077446e-06, + "loss": -0.0201, + "num_tokens": 1994550.0, + "reward": 0.3174999952316284, + "reward_std": 0.5620428323745728, + "rewards/reward_func/mean": 0.3174999952316284, + "rewards/reward_func/std": 0.5362235903739929, + "sampling/importance_sampling_ratio/max": 2.0678493976593018, + "sampling/importance_sampling_ratio/mean": 1.0567212104797363, + "sampling/importance_sampling_ratio/min": 0.6045528650283813, + "sampling/sampling_logp_difference/max": 0.8093851804733276, + "sampling/sampling_logp_difference/mean": 0.02113654837012291, + "step": 357, + "step_time": 34.96656133400393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3537905514240265, + "epoch": 0.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3669836521148682, + "kl": 0.07361885160207748, + "learning_rate": 3.724201390651263e-06, + "loss": 0.1908, + "num_tokens": 2000048.0, + "reward": 0.1937500238418579, + "reward_std": 0.314365953207016, + "rewards/reward_func/mean": 0.1937500238418579, + "rewards/reward_func/std": 0.49517494440078735, + "sampling/importance_sampling_ratio/max": 1.4376270771026611, + "sampling/importance_sampling_ratio/mean": 0.7732110023498535, + "sampling/importance_sampling_ratio/min": 0.14205169677734375, + "sampling/sampling_logp_difference/max": 1.1421258449554443, + "sampling/sampling_logp_difference/mean": 0.029162388294935226, + "step": 358, + "step_time": 42.63583009800641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.38291653990745544, + "epoch": 0.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0044505596160889, + "kl": 0.022494001314044, + "learning_rate": 3.7171352899317743e-06, + "loss": -0.1117, + "num_tokens": 2006426.0, + "reward": -0.05999999865889549, + "reward_std": 0.048807330429553986, + "rewards/reward_func/mean": -0.05999999865889549, + "rewards/reward_func/std": 0.05014265328645706, + "sampling/importance_sampling_ratio/max": 1.710400104522705, + "sampling/importance_sampling_ratio/mean": 1.183423638343811, + "sampling/importance_sampling_ratio/min": 0.698122501373291, + "sampling/sampling_logp_difference/max": 0.641355037689209, + "sampling/sampling_logp_difference/mean": 0.02460392192006111, + "step": 359, + "step_time": 50.55940616400039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3306000530719757, + "epoch": 0.72, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.998912513256073, + "kl": 0.020860590040683746, + "learning_rate": 3.710056422039033e-06, + "loss": -0.1409, + "num_tokens": 2012069.0, + "reward": 0.22500000894069672, + "reward_std": 0.5172683000564575, + "rewards/reward_func/mean": 0.22500000894069672, + "rewards/reward_func/std": 0.4788975417613983, + "sampling/importance_sampling_ratio/max": 1.3302267789840698, + "sampling/importance_sampling_ratio/mean": 0.8430871963500977, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.8340984582901001, + "sampling/sampling_logp_difference/mean": 0.02408684231340885, + "step": 360, + "step_time": 27.43443747400306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.34160998463630676, + "epoch": 0.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9686026573181152, + "kl": 0.014279250055551529, + "learning_rate": 3.702964861227013e-06, + "loss": 0.0154, + "num_tokens": 2017315.0, + "reward": 0.09125000238418579, + "reward_std": 0.2781270742416382, + "rewards/reward_func/mean": 0.09125000238418579, + "rewards/reward_func/std": 0.3651394248008728, + "sampling/importance_sampling_ratio/max": 1.65193510055542, + "sampling/importance_sampling_ratio/mean": 0.9402889013290405, + "sampling/importance_sampling_ratio/min": 0.45469439029693604, + "sampling/sampling_logp_difference/max": 0.31305837631225586, + "sampling/sampling_logp_difference/mean": 0.023152697831392288, + "step": 361, + "step_time": 39.433918608003296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 54.5, + "completions/mean_terminated_length": 54.5, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.36318618059158325, + "epoch": 0.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7241699695587158, + "kl": 0.017520280554890633, + "learning_rate": 3.695860681882832e-06, + "loss": 0.0466, + "num_tokens": 2023159.0, + "reward": 0.07249999791383743, + "reward_std": 0.2886144816875458, + "rewards/reward_func/mean": 0.07249999791383743, + "rewards/reward_func/std": 0.38074177503585815, + "sampling/importance_sampling_ratio/max": 1.5708813667297363, + "sampling/importance_sampling_ratio/mean": 0.8219673037528992, + "sampling/importance_sampling_ratio/min": 0.18283437192440033, + "sampling/sampling_logp_difference/max": 0.6703026294708252, + "sampling/sampling_logp_difference/mean": 0.022826572880148888, + "step": 362, + "step_time": 46.71442539000418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 50.625, + "completions/mean_terminated_length": 50.625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3641355633735657, + "epoch": 0.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7925549149513245, + "kl": 0.019471367821097374, + "learning_rate": 3.6887439585259693e-06, + "loss": 0.1953, + "num_tokens": 2028306.0, + "reward": 0.2175000011920929, + "reward_std": 0.512782871723175, + "rewards/reward_func/mean": 0.2175000011920929, + "rewards/reward_func/std": 0.4749361276626587, + "sampling/importance_sampling_ratio/max": 1.7815968990325928, + "sampling/importance_sampling_ratio/mean": 1.0899240970611572, + "sampling/importance_sampling_ratio/min": 0.39993321895599365, + "sampling/sampling_logp_difference/max": 0.7556244134902954, + "sampling/sampling_logp_difference/mean": 0.02318955399096012, + "step": 363, + "step_time": 33.87657535800827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 56.5, + "completions/mean_terminated_length": 56.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3745487332344055, + "epoch": 0.728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9719190001487732, + "kl": 0.01766936480998993, + "learning_rate": 3.6816147658074864e-06, + "loss": -0.0714, + "num_tokens": 2033664.0, + "reward": 0.22875000536441803, + "reward_std": 0.3135777711868286, + "rewards/reward_func/mean": 0.22875000536441803, + "rewards/reward_func/std": 0.47726717591285706, + "sampling/importance_sampling_ratio/max": 1.5119142532348633, + "sampling/importance_sampling_ratio/mean": 0.9923655986785889, + "sampling/importance_sampling_ratio/min": 0.5845767259597778, + "sampling/sampling_logp_difference/max": 0.570970892906189, + "sampling/sampling_logp_difference/mean": 0.024401342496275902, + "step": 364, + "step_time": 39.12082826299593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 60.25, + "completions/mean_terminated_length": 60.25, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.386588454246521, + "epoch": 0.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0872100591659546, + "kl": 0.018672293052077293, + "learning_rate": 3.6744731785092396e-06, + "loss": -0.0248, + "num_tokens": 2038952.0, + "reward": 0.20375001430511475, + "reward_std": 0.32467734813690186, + "rewards/reward_func/mean": 0.20375001430511475, + "rewards/reward_func/std": 0.4832313358783722, + "sampling/importance_sampling_ratio/max": 1.5475658178329468, + "sampling/importance_sampling_ratio/mean": 1.1252450942993164, + "sampling/importance_sampling_ratio/min": 0.6070107817649841, + "sampling/sampling_logp_difference/max": 0.6747951507568359, + "sampling/sampling_logp_difference/mean": 0.024049527943134308, + "step": 365, + "step_time": 36.08769363799365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.32434672117233276, + "epoch": 0.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1058688163757324, + "kl": 0.022890109568834305, + "learning_rate": 3.6673192715431016e-06, + "loss": -0.1808, + "num_tokens": 2044744.0, + "reward": 0.3387500047683716, + "reward_std": 0.5674425363540649, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.5432820916175842, + "sampling/importance_sampling_ratio/max": 1.6937315464019775, + "sampling/importance_sampling_ratio/mean": 0.853196382522583, + "sampling/importance_sampling_ratio/min": 0.4900035858154297, + "sampling/sampling_logp_difference/max": 0.5789165496826172, + "sampling/sampling_logp_difference/mean": 0.02218322455883026, + "step": 366, + "step_time": 34.316845288994955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3331367075443268, + "epoch": 0.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9906635880470276, + "kl": 0.01753188669681549, + "learning_rate": 3.6601531199501715e-06, + "loss": -0.0365, + "num_tokens": 2050626.0, + "reward": 0.2150000035762787, + "reward_std": 0.3200206756591797, + "rewards/reward_func/mean": 0.2150000035762787, + "rewards/reward_func/std": 0.4824047088623047, + "sampling/importance_sampling_ratio/max": 1.387757658958435, + "sampling/importance_sampling_ratio/mean": 1.0535778999328613, + "sampling/importance_sampling_ratio/min": 0.5726510882377625, + "sampling/sampling_logp_difference/max": 0.5058789253234863, + "sampling/sampling_logp_difference/mean": 0.018796022981405258, + "step": 367, + "step_time": 43.17819735400553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 46.0, + "completions/mean_terminated_length": 46.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.38309377431869507, + "epoch": 0.736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1113532781600952, + "kl": 0.020980726927518845, + "learning_rate": 3.652974798899988e-06, + "loss": -0.0142, + "num_tokens": 2056725.0, + "reward": 0.09749999642372131, + "reward_std": 0.26799049973487854, + "rewards/reward_func/mean": 0.09749999642372131, + "rewards/reward_func/std": 0.35784077644348145, + "sampling/importance_sampling_ratio/max": 2.1293060779571533, + "sampling/importance_sampling_ratio/mean": 1.0104879140853882, + "sampling/importance_sampling_ratio/min": 0.22460012137889862, + "sampling/sampling_logp_difference/max": 0.6540035009384155, + "sampling/sampling_logp_difference/mean": 0.028138641268014908, + "step": 368, + "step_time": 43.222731264002505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.32224565744400024, + "epoch": 0.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3402920961380005, + "kl": 0.012260911986231804, + "learning_rate": 3.645784383689742e-06, + "loss": 0.1681, + "num_tokens": 2061716.0, + "reward": 0.32625001668930054, + "reward_std": 0.5579476356506348, + "rewards/reward_func/mean": 0.32625001668930054, + "rewards/reward_func/std": 0.537532389163971, + "sampling/importance_sampling_ratio/max": 1.767604947090149, + "sampling/importance_sampling_ratio/mean": 0.9820557236671448, + "sampling/importance_sampling_ratio/min": 0.3409233093261719, + "sampling/sampling_logp_difference/max": 0.38132715225219727, + "sampling/sampling_logp_difference/mean": 0.024098357185721397, + "step": 369, + "step_time": 35.26750406099018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 62.875, + "completions/mean_terminated_length": 62.875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.3785794973373413, + "epoch": 0.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0662072896957397, + "kl": 0.037193797528743744, + "learning_rate": 3.6385819497434877e-06, + "loss": -0.1421, + "num_tokens": 2066831.0, + "reward": 0.45249998569488525, + "reward_std": 0.6269056797027588, + "rewards/reward_func/mean": 0.45249998569488525, + "rewards/reward_func/std": 0.5811749696731567, + "sampling/importance_sampling_ratio/max": 1.6641879081726074, + "sampling/importance_sampling_ratio/mean": 0.9713910222053528, + "sampling/importance_sampling_ratio/min": 0.28194481134414673, + "sampling/sampling_logp_difference/max": 0.9472329616546631, + "sampling/sampling_logp_difference/mean": 0.025759253650903702, + "step": 370, + "step_time": 25.88203328898817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 48.875, + "completions/mean_terminated_length": 48.875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3221362233161926, + "epoch": 0.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9221799969673157, + "kl": 0.030139964073896408, + "learning_rate": 3.631367572611348e-06, + "loss": 0.0918, + "num_tokens": 2073296.0, + "reward": 0.08624999970197678, + "reward_std": 0.28050848841667175, + "rewards/reward_func/mean": 0.08624999970197678, + "rewards/reward_func/std": 0.3702484667301178, + "sampling/importance_sampling_ratio/max": 1.8722600936889648, + "sampling/importance_sampling_ratio/mean": 0.9809074401855469, + "sampling/importance_sampling_ratio/min": 0.40770983695983887, + "sampling/sampling_logp_difference/max": 0.6183086037635803, + "sampling/sampling_logp_difference/mean": 0.02641558088362217, + "step": 371, + "step_time": 48.31357828999171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.35156160593032837, + "epoch": 0.744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.759164035320282, + "kl": 0.023832060396671295, + "learning_rate": 3.6241413279687256e-06, + "loss": -0.0865, + "num_tokens": 2079272.0, + "reward": 0.3449999988079071, + "reward_std": 0.5492082238197327, + "rewards/reward_func/mean": 0.3449999988079071, + "rewards/reward_func/std": 0.5293661952018738, + "sampling/importance_sampling_ratio/max": 1.4884122610092163, + "sampling/importance_sampling_ratio/mean": 0.8230412006378174, + "sampling/importance_sampling_ratio/min": 0.43746134638786316, + "sampling/sampling_logp_difference/max": 0.5400235652923584, + "sampling/sampling_logp_difference/mean": 0.02419007569551468, + "step": 372, + "step_time": 45.951615555008175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 58.5, + "completions/mean_terminated_length": 58.5, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.31760358810424805, + "epoch": 0.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7639608979225159, + "kl": 0.07498195022344589, + "learning_rate": 3.616903291615506e-06, + "loss": -0.0004, + "num_tokens": 2084366.0, + "reward": 0.3387500047683716, + "reward_std": 0.5394263863563538, + "rewards/reward_func/mean": 0.3387500047683716, + "rewards/reward_func/std": 0.5155146718025208, + "sampling/importance_sampling_ratio/max": 1.6529487371444702, + "sampling/importance_sampling_ratio/mean": 0.9317770004272461, + "sampling/importance_sampling_ratio/min": 0.2725166082382202, + "sampling/sampling_logp_difference/max": 1.0274620056152344, + "sampling/sampling_logp_difference/mean": 0.02573678269982338, + "step": 373, + "step_time": 26.18881739700737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 59.0, + "completions/mean_terminated_length": 59.0, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.29528483748435974, + "epoch": 0.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6288770437240601, + "kl": 0.018295394256711006, + "learning_rate": 3.609653539475268e-06, + "loss": -0.0202, + "num_tokens": 2090117.0, + "reward": 0.3174999952316284, + "reward_std": 0.3078264892101288, + "rewards/reward_func/mean": 0.3174999952316284, + "rewards/reward_func/std": 0.5612931847572327, + "sampling/importance_sampling_ratio/max": 1.0934966802597046, + "sampling/importance_sampling_ratio/mean": 0.6602436304092407, + "sampling/importance_sampling_ratio/min": 0.35864314436912537, + "sampling/sampling_logp_difference/max": 0.6380555629730225, + "sampling/sampling_logp_difference/mean": 0.023511648178100586, + "step": 374, + "step_time": 49.95311562300776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 58.75, + "completions/mean_terminated_length": 58.75, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.33941906690597534, + "epoch": 0.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8127002120018005, + "kl": 0.028118256479501724, + "learning_rate": 3.6023921475944795e-06, + "loss": -0.1582, + "num_tokens": 2095578.0, + "reward": 0.2199999988079071, + "reward_std": 0.29089051485061646, + "rewards/reward_func/mean": 0.2199999988079071, + "rewards/reward_func/std": 0.4539981484413147, + "sampling/importance_sampling_ratio/max": 1.3880345821380615, + "sampling/importance_sampling_ratio/mean": 0.8715201020240784, + "sampling/importance_sampling_ratio/min": 0.11366145312786102, + "sampling/sampling_logp_difference/max": 0.8331606388092041, + "sampling/sampling_logp_difference/mean": 0.023476149886846542, + "step": 375, + "step_time": 39.132226767003885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3131125569343567, + "epoch": 0.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.068045973777771, + "kl": 0.0240363497287035, + "learning_rate": 3.5951191921417063e-06, + "loss": -0.0651, + "num_tokens": 2100937.0, + "reward": 0.1875, + "reward_std": 0.5449391603469849, + "rewards/reward_func/mean": 0.1875, + "rewards/reward_func/std": 0.5059291124343872, + "sampling/importance_sampling_ratio/max": 1.3649033308029175, + "sampling/importance_sampling_ratio/mean": 0.7642968893051147, + "sampling/importance_sampling_ratio/min": 0.24381500482559204, + "sampling/sampling_logp_difference/max": 0.5894099473953247, + "sampling/sampling_logp_difference/mean": 0.025731489062309265, + "step": 376, + "step_time": 36.06358368200017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 55.625, + "completions/mean_terminated_length": 55.625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.36141708493232727, + "epoch": 0.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1907614469528198, + "kl": 0.018009314313530922, + "learning_rate": 3.5878347494068083e-06, + "loss": 0.3203, + "num_tokens": 2106897.0, + "reward": 0.45250001549720764, + "reward_std": 0.6097580790519714, + "rewards/reward_func/mean": 0.45250001549720764, + "rewards/reward_func/std": 0.5646427273750305, + "sampling/importance_sampling_ratio/max": 1.9138866662979126, + "sampling/importance_sampling_ratio/mean": 1.0367978811264038, + "sampling/importance_sampling_ratio/min": 0.4980725646018982, + "sampling/sampling_logp_difference/max": 0.7002124786376953, + "sampling/sampling_logp_difference/mean": 0.022084344178438187, + "step": 377, + "step_time": 42.03821306199825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.31293684244155884, + "epoch": 0.756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9326619505882263, + "kl": 0.024975117295980453, + "learning_rate": 3.580538895800144e-06, + "loss": -0.1336, + "num_tokens": 2112259.0, + "reward": 0.17624999582767487, + "reward_std": 0.3398403823375702, + "rewards/reward_func/mean": 0.17624999582767487, + "rewards/reward_func/std": 0.5064142346382141, + "sampling/importance_sampling_ratio/max": 1.1192891597747803, + "sampling/importance_sampling_ratio/mean": 0.757171630859375, + "sampling/importance_sampling_ratio/min": 0.45731785893440247, + "sampling/sampling_logp_difference/max": 0.5144007205963135, + "sampling/sampling_logp_difference/mean": 0.025648921728134155, + "step": 378, + "step_time": 37.88494844498928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 55.5, + "completions/mean_terminated_length": 55.5, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3656163811683655, + "epoch": 0.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8823372721672058, + "kl": 0.02256789244711399, + "learning_rate": 3.573231707851765e-06, + "loss": 0.2197, + "num_tokens": 2118172.0, + "reward": 0.4424999952316284, + "reward_std": 0.6283525824546814, + "rewards/reward_func/mean": 0.4424999952316284, + "rewards/reward_func/std": 0.5822800397872925, + "sampling/importance_sampling_ratio/max": 1.8701510429382324, + "sampling/importance_sampling_ratio/mean": 0.8111326694488525, + "sampling/importance_sampling_ratio/min": 0.30452919006347656, + "sampling/sampling_logp_difference/max": 0.8101745843887329, + "sampling/sampling_logp_difference/mean": 0.026399342343211174, + "step": 379, + "step_time": 43.92895183300425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.0, + "completions/mean_terminated_length": 52.0, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3246749937534332, + "epoch": 0.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9643809795379639, + "kl": 0.020442264154553413, + "learning_rate": 3.5659132622106152e-06, + "loss": -0.1417, + "num_tokens": 2123917.0, + "reward": 0.35374999046325684, + "reward_std": 0.5508840680122375, + "rewards/reward_func/mean": 0.35374999046325684, + "rewards/reward_func/std": 0.5303890109062195, + "sampling/importance_sampling_ratio/max": 1.7333760261535645, + "sampling/importance_sampling_ratio/mean": 0.9654305577278137, + "sampling/importance_sampling_ratio/min": 0.37011101841926575, + "sampling/sampling_logp_difference/max": 0.5683209896087646, + "sampling/sampling_logp_difference/mean": 0.02381267584860325, + "step": 380, + "step_time": 32.53779908501019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 56.375, + "completions/mean_terminated_length": 56.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.33849892020225525, + "epoch": 0.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9345517754554749, + "kl": 0.01798607036471367, + "learning_rate": 3.5585836356437266e-06, + "loss": 0.1689, + "num_tokens": 2129155.0, + "reward": 0.4699999988079071, + "reward_std": 0.5834348201751709, + "rewards/reward_func/mean": 0.4699999988079071, + "rewards/reward_func/std": 0.5406873822212219, + "sampling/importance_sampling_ratio/max": 1.434735894203186, + "sampling/importance_sampling_ratio/mean": 0.9351725578308105, + "sampling/importance_sampling_ratio/min": 0.32867270708084106, + "sampling/sampling_logp_difference/max": 0.8937342166900635, + "sampling/sampling_logp_difference/mean": 0.023814164102077484, + "step": 381, + "step_time": 35.22910028499609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.35224515199661255, + "epoch": 0.764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.69114750623703, + "kl": 0.020070038735866547, + "learning_rate": 3.551242905035412e-06, + "loss": 0.0692, + "num_tokens": 2135558.0, + "reward": 0.08374999463558197, + "reward_std": 0.2716542184352875, + "rewards/reward_func/mean": 0.08374999463558197, + "rewards/reward_func/std": 0.3633549213409424, + "sampling/importance_sampling_ratio/max": 1.3249363899230957, + "sampling/importance_sampling_ratio/mean": 0.8756052255630493, + "sampling/importance_sampling_ratio/min": 0.5583480596542358, + "sampling/sampling_logp_difference/max": 0.3986530303955078, + "sampling/sampling_logp_difference/mean": 0.02286503091454506, + "step": 382, + "step_time": 47.23492028898909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3356274664402008, + "epoch": 0.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9842677116394043, + "kl": 0.03034183755517006, + "learning_rate": 3.5438911473864633e-06, + "loss": 0.1462, + "num_tokens": 2141777.0, + "reward": 0.1912500113248825, + "reward_std": 0.30746302008628845, + "rewards/reward_func/mean": 0.1912500113248825, + "rewards/reward_func/std": 0.47405359148979187, + "sampling/importance_sampling_ratio/max": 1.4171618223190308, + "sampling/importance_sampling_ratio/mean": 0.9373311400413513, + "sampling/importance_sampling_ratio/min": 0.4872235059738159, + "sampling/sampling_logp_difference/max": 0.6592245101928711, + "sampling/sampling_logp_difference/mean": 0.024339091032743454, + "step": 383, + "step_time": 52.436622552995686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3243747651576996, + "epoch": 0.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8072435259819031, + "kl": 0.014891182072460651, + "learning_rate": 3.5365284398133404e-06, + "loss": -0.169, + "num_tokens": 2146987.0, + "reward": 0.3149999976158142, + "reward_std": 0.543981671333313, + "rewards/reward_func/mean": 0.3149999976158142, + "rewards/reward_func/std": 0.5302021503448486, + "sampling/importance_sampling_ratio/max": 1.3961093425750732, + "sampling/importance_sampling_ratio/mean": 0.6877409219741821, + "sampling/importance_sampling_ratio/min": 0.28648021817207336, + "sampling/sampling_logp_difference/max": 0.7951034903526306, + "sampling/sampling_logp_difference/mean": 0.02731979638338089, + "step": 384, + "step_time": 40.187907672996516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.3296257555484772, + "epoch": 0.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6858446598052979, + "kl": 0.016891244798898697, + "learning_rate": 3.52915485954736e-06, + "loss": 0.0741, + "num_tokens": 2152302.0, + "reward": 0.04874999076128006, + "reward_std": 0.3078903555870056, + "rewards/reward_func/mean": 0.04874999076128006, + "rewards/reward_func/std": 0.389154314994812, + "sampling/importance_sampling_ratio/max": 1.2545758485794067, + "sampling/importance_sampling_ratio/mean": 0.8542709946632385, + "sampling/importance_sampling_ratio/min": 0.4029653072357178, + "sampling/sampling_logp_difference/max": 0.4015458822250366, + "sampling/sampling_logp_difference/mean": 0.023256313055753708, + "step": 385, + "step_time": 36.99427409100463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 58.0, + "completions/mean_terminated_length": 58.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.35816895961761475, + "epoch": 0.772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0791374444961548, + "kl": 0.06013531610369682, + "learning_rate": 3.521770483933891e-06, + "loss": 0.1586, + "num_tokens": 2157697.0, + "reward": 0.0625, + "reward_std": 0.28779086470603943, + "rewards/reward_func/mean": 0.0625, + "rewards/reward_func/std": 0.3733152747154236, + "sampling/importance_sampling_ratio/max": 1.4287514686584473, + "sampling/importance_sampling_ratio/mean": 0.8348481059074402, + "sampling/importance_sampling_ratio/min": 0.22553585469722748, + "sampling/sampling_logp_difference/max": 0.8671650886535645, + "sampling/sampling_logp_difference/mean": 0.02565944194793701, + "step": 386, + "step_time": 41.950704916001996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.33600252866744995, + "epoch": 0.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.198814034461975, + "kl": 0.012472853064537048, + "learning_rate": 3.514375390431539e-06, + "loss": 0.2883, + "num_tokens": 2164144.0, + "reward": 0.19749999046325684, + "reward_std": 0.3289427161216736, + "rewards/reward_func/mean": 0.19749999046325684, + "rewards/reward_func/std": 0.4798734784126282, + "sampling/importance_sampling_ratio/max": 1.8541417121887207, + "sampling/importance_sampling_ratio/mean": 0.9226713180541992, + "sampling/importance_sampling_ratio/min": 0.33913132548332214, + "sampling/sampling_logp_difference/max": 0.598806619644165, + "sampling/sampling_logp_difference/mean": 0.021760722622275352, + "step": 387, + "step_time": 44.073393629994825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 53.875, + "completions/mean_terminated_length": 53.875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.4183434247970581, + "epoch": 0.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.299253225326538, + "kl": 0.02365995943546295, + "learning_rate": 3.5069696566113347e-06, + "loss": -0.0277, + "num_tokens": 2169925.0, + "reward": 0.3137500286102295, + "reward_std": 0.5675816535949707, + "rewards/reward_func/mean": 0.3137500286102295, + "rewards/reward_func/std": 0.5541515946388245, + "sampling/importance_sampling_ratio/max": 1.9320272207260132, + "sampling/importance_sampling_ratio/mean": 0.9678875803947449, + "sampling/importance_sampling_ratio/min": 0.4237723648548126, + "sampling/sampling_logp_difference/max": 0.9530621767044067, + "sampling/sampling_logp_difference/mean": 0.028726529330015182, + "step": 388, + "step_time": 36.650615365986596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.4050993025302887, + "epoch": 0.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.352435827255249, + "kl": 0.0169648639857769, + "learning_rate": 3.499553360155923e-06, + "loss": -0.2526, + "num_tokens": 2176095.0, + "reward": 0.06624999642372131, + "reward_std": 0.2910441756248474, + "rewards/reward_func/mean": 0.06624999642372131, + "rewards/reward_func/std": 0.37965914607048035, + "sampling/importance_sampling_ratio/max": 1.629567265510559, + "sampling/importance_sampling_ratio/mean": 0.9991623163223267, + "sampling/importance_sampling_ratio/min": 0.5820651650428772, + "sampling/sampling_logp_difference/max": 0.25946611166000366, + "sampling/sampling_logp_difference/mean": 0.021378565579652786, + "step": 389, + "step_time": 52.00782443599019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.35824936628341675, + "epoch": 0.78, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9068090319633484, + "kl": 0.027212774381041527, + "learning_rate": 3.4921265788587432e-06, + "loss": -0.0936, + "num_tokens": 2181657.0, + "reward": 0.059999994933605194, + "reward_std": 0.27817416191101074, + "rewards/reward_func/mean": 0.059999994933605194, + "rewards/reward_func/std": 0.35496482253074646, + "sampling/importance_sampling_ratio/max": 2.0222272872924805, + "sampling/importance_sampling_ratio/mean": 0.9855067729949951, + "sampling/importance_sampling_ratio/min": 0.47639888525009155, + "sampling/sampling_logp_difference/max": 0.9331116676330566, + "sampling/sampling_logp_difference/mean": 0.027233093976974487, + "step": 390, + "step_time": 42.25256816399633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 50.375, + "completions/mean_terminated_length": 50.375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.32872098684310913, + "epoch": 0.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7149050235748291, + "kl": 0.044429853558540344, + "learning_rate": 3.484689390623218e-06, + "loss": 0.1788, + "num_tokens": 2187851.0, + "reward": 0.0637499988079071, + "reward_std": 0.2887832820415497, + "rewards/reward_func/mean": 0.0637499988079071, + "rewards/reward_func/std": 0.38029828667640686, + "sampling/importance_sampling_ratio/max": 1.5726252794265747, + "sampling/importance_sampling_ratio/mean": 0.7780969738960266, + "sampling/importance_sampling_ratio/min": 0.21605902910232544, + "sampling/sampling_logp_difference/max": 0.9326303005218506, + "sampling/sampling_logp_difference/mean": 0.02885178104043007, + "step": 391, + "step_time": 44.16586426900176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 58.25, + "completions/mean_terminated_length": 58.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.353057861328125, + "epoch": 0.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6402285695075989, + "kl": 0.047673989087343216, + "learning_rate": 3.4772418734619325e-06, + "loss": -0.1065, + "num_tokens": 2193337.0, + "reward": 0.5874999761581421, + "reward_std": 0.5623499155044556, + "rewards/reward_func/mean": 0.5874999761581421, + "rewards/reward_func/std": 0.5455207228660583, + "sampling/importance_sampling_ratio/max": 1.056589126586914, + "sampling/importance_sampling_ratio/mean": 0.6921772360801697, + "sampling/importance_sampling_ratio/min": 0.2916169762611389, + "sampling/sampling_logp_difference/max": 1.1112589836120605, + "sampling/sampling_logp_difference/mean": 0.024330832064151764, + "step": 392, + "step_time": 37.52057623099245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 56.75, + "completions/mean_terminated_length": 56.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.35051673650741577, + "epoch": 0.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.877177894115448, + "kl": 0.013033976778388023, + "learning_rate": 3.4697841054958163e-06, + "loss": 0.1362, + "num_tokens": 2199559.0, + "reward": 0.1837500035762787, + "reward_std": 0.5407979488372803, + "rewards/reward_func/mean": 0.1837500035762787, + "rewards/reward_func/std": 0.5013107657432556, + "sampling/importance_sampling_ratio/max": 1.3691225051879883, + "sampling/importance_sampling_ratio/mean": 0.940036416053772, + "sampling/importance_sampling_ratio/min": 0.3252532482147217, + "sampling/sampling_logp_difference/max": 0.3541145324707031, + "sampling/sampling_logp_difference/mean": 0.023762091994285583, + "step": 393, + "step_time": 45.31983452399436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.2770605981349945, + "epoch": 0.788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6571247577667236, + "kl": 0.027253959327936172, + "learning_rate": 3.4623161649533284e-06, + "loss": -0.0846, + "num_tokens": 2205060.0, + "reward": -0.0625, + "reward_std": 0.05641929805278778, + "rewards/reward_func/mean": -0.0625, + "rewards/reward_func/std": 0.05994044616818428, + "sampling/importance_sampling_ratio/max": 1.2533941268920898, + "sampling/importance_sampling_ratio/mean": 0.7182776927947998, + "sampling/importance_sampling_ratio/min": 0.45736369490623474, + "sampling/sampling_logp_difference/max": 0.8105928897857666, + "sampling/sampling_logp_difference/mean": 0.0246109776198864, + "step": 394, + "step_time": 46.803369777990156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.32860368490219116, + "epoch": 0.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.807575523853302, + "kl": 0.039417944848537445, + "learning_rate": 3.4548381301696298e-06, + "loss": -0.1621, + "num_tokens": 2210663.0, + "reward": 0.09875000268220901, + "reward_std": 0.26192745566368103, + "rewards/reward_func/mean": 0.09875000268220901, + "rewards/reward_func/std": 0.35385382175445557, + "sampling/importance_sampling_ratio/max": 1.6175183057785034, + "sampling/importance_sampling_ratio/mean": 0.8804988861083984, + "sampling/importance_sampling_ratio/min": 0.41070258617401123, + "sampling/sampling_logp_difference/max": 0.7530922889709473, + "sampling/sampling_logp_difference/mean": 0.020096953958272934, + "step": 395, + "step_time": 40.773527258003014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.31244707107543945, + "epoch": 0.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046858310699463, + "kl": 0.014741847291588783, + "learning_rate": 3.4473500795857674e-06, + "loss": 0.1284, + "num_tokens": 2215955.0, + "reward": 0.22750000655651093, + "reward_std": 0.4991302490234375, + "rewards/reward_func/mean": 0.22750000655651093, + "rewards/reward_func/std": 0.46219196915626526, + "sampling/importance_sampling_ratio/max": 2.1469762325286865, + "sampling/importance_sampling_ratio/mean": 1.328494668006897, + "sampling/importance_sampling_ratio/min": 0.5685848593711853, + "sampling/sampling_logp_difference/max": 0.5777333974838257, + "sampling/sampling_logp_difference/mean": 0.022995343431830406, + "step": 396, + "step_time": 37.15637919999426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3469958007335663, + "epoch": 0.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2579119205474854, + "kl": 0.045768074691295624, + "learning_rate": 3.4398520917478478e-06, + "loss": 0.1787, + "num_tokens": 2221464.0, + "reward": 0.3199999928474426, + "reward_std": 0.29569682478904724, + "rewards/reward_func/mean": 0.3199999928474426, + "rewards/reward_func/std": 0.560943067073822, + "sampling/importance_sampling_ratio/max": 1.4082008600234985, + "sampling/importance_sampling_ratio/mean": 0.9037089347839355, + "sampling/importance_sampling_ratio/min": 0.24859501421451569, + "sampling/sampling_logp_difference/max": 1.1122647523880005, + "sampling/sampling_logp_difference/mean": 0.028527939692139626, + "step": 397, + "step_time": 39.99640729100793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 55.375, + "completions/mean_terminated_length": 55.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3531041443347931, + "epoch": 0.796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0242723226547241, + "kl": 0.017868993803858757, + "learning_rate": 3.4323442453062173e-06, + "loss": 0.2376, + "num_tokens": 2226383.0, + "reward": 0.5662500262260437, + "reward_std": 0.28834110498428345, + "rewards/reward_func/mean": 0.5662500262260437, + "rewards/reward_func/std": 0.5615269541740417, + "sampling/importance_sampling_ratio/max": 2.4575791358947754, + "sampling/importance_sampling_ratio/mean": 1.1844841241836548, + "sampling/importance_sampling_ratio/min": 0.8696222901344299, + "sampling/sampling_logp_difference/max": 0.48058170080184937, + "sampling/sampling_logp_difference/mean": 0.023350011557340622, + "step": 398, + "step_time": 32.65750913400552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.3498823940753937, + "epoch": 0.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7218272686004639, + "kl": 0.03050382062792778, + "learning_rate": 3.4248266190146307e-06, + "loss": -0.0799, + "num_tokens": 2231912.0, + "reward": 0.3075000047683716, + "reward_std": 0.5727229714393616, + "rewards/reward_func/mean": 0.3075000047683716, + "rewards/reward_func/std": 0.5470113754272461, + "sampling/importance_sampling_ratio/max": 1.0502070188522339, + "sampling/importance_sampling_ratio/mean": 0.6887789964675903, + "sampling/importance_sampling_ratio/min": 0.4914277195930481, + "sampling/sampling_logp_difference/max": 0.7550356388092041, + "sampling/sampling_logp_difference/mean": 0.027827613055706024, + "step": 399, + "step_time": 41.33596688800026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 56.125, + "completions/mean_terminated_length": 56.125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 0.3193606436252594, + "epoch": 0.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2495148181915283, + "kl": 0.017775503918528557, + "learning_rate": 3.417299291729431e-06, + "loss": -0.0324, + "num_tokens": 2236982.0, + "reward": 0.3462499976158142, + "reward_std": 0.5585935711860657, + "rewards/reward_func/mean": 0.3462499976158142, + "rewards/reward_func/std": 0.5422160029411316, + "sampling/importance_sampling_ratio/max": 2.1713757514953613, + "sampling/importance_sampling_ratio/mean": 1.0706489086151123, + "sampling/importance_sampling_ratio/min": 0.2907283306121826, + "sampling/sampling_logp_difference/max": 1.8012995719909668, + "sampling/sampling_logp_difference/mean": 0.035895854234695435, + "step": 400, + "step_time": 35.204487941999105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 54.5, + "completions/mean_terminated_length": 54.5, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3636338710784912, + "epoch": 0.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8902538418769836, + "kl": 0.013676811009645462, + "learning_rate": 3.4097623424087196e-06, + "loss": -0.1516, + "num_tokens": 2242653.0, + "reward": 0.3462499678134918, + "reward_std": 0.26222485303878784, + "rewards/reward_func/mean": 0.3462499678134918, + "rewards/reward_func/std": 0.5304967164993286, + "sampling/importance_sampling_ratio/max": 1.1426626443862915, + "sampling/importance_sampling_ratio/mean": 0.8304384350776672, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.0202627182006836, + "sampling/sampling_logp_difference/mean": 0.029507692903280258, + "step": 401, + "step_time": 40.37125378000201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 57.0, + "completions/mean_terminated_length": 57.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.36135992407798767, + "epoch": 0.804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7169837355613708, + "kl": 0.02164129912853241, + "learning_rate": 3.4022158501115283e-06, + "loss": 0.0952, + "num_tokens": 2248740.0, + "reward": 0.08000000566244125, + "reward_std": 0.2766728699207306, + "rewards/reward_func/mean": 0.08000000566244125, + "rewards/reward_func/std": 0.3709062337875366, + "sampling/importance_sampling_ratio/max": 2.0062737464904785, + "sampling/importance_sampling_ratio/mean": 0.9354610443115234, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.5968524217605591, + "sampling/sampling_logp_difference/mean": 0.02405213564634323, + "step": 402, + "step_time": 40.320551773998886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 55.0, + "completions/mean_terminated_length": 55.0, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.3496766686439514, + "epoch": 0.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.099068522453308, + "kl": 0.031258516013622284, + "learning_rate": 3.39465989399699e-06, + "loss": -0.186, + "num_tokens": 2253924.0, + "reward": 0.3424999713897705, + "reward_std": 0.5592527389526367, + "rewards/reward_func/mean": 0.3424999713897705, + "rewards/reward_func/std": 0.5341682434082031, + "sampling/importance_sampling_ratio/max": 1.7832975387573242, + "sampling/importance_sampling_ratio/mean": 0.9368232488632202, + "sampling/importance_sampling_ratio/min": 0.2631537914276123, + "sampling/sampling_logp_difference/max": 1.3949875831604004, + "sampling/sampling_logp_difference/mean": 0.022440284490585327, + "step": 403, + "step_time": 31.128040015988518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.37068814039230347, + "epoch": 0.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0627682209014893, + "kl": 0.12361843883991241, + "learning_rate": 3.3870945533235104e-06, + "loss": 0.3472, + "num_tokens": 2259059.0, + "reward": 0.20624998211860657, + "reward_std": 0.3201104700565338, + "rewards/reward_func/mean": 0.20624998211860657, + "rewards/reward_func/std": 0.47850772738456726, + "sampling/importance_sampling_ratio/max": 1.4040093421936035, + "sampling/importance_sampling_ratio/mean": 0.7943763732910156, + "sampling/importance_sampling_ratio/min": 0.2126779854297638, + "sampling/sampling_logp_difference/max": 1.2342114448547363, + "sampling/sampling_logp_difference/mean": 0.030054152011871338, + "step": 404, + "step_time": 38.50842455399106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.25, + "completions/mean_terminated_length": 51.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.32690513134002686, + "epoch": 0.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.05023193359375, + "kl": 0.017625313252210617, + "learning_rate": 3.3795199074479312e-06, + "loss": 0.2463, + "num_tokens": 2264094.0, + "reward": 0.4337500035762787, + "reward_std": 0.6317378282546997, + "rewards/reward_func/mean": 0.4337500035762787, + "rewards/reward_func/std": 0.5862212777137756, + "sampling/importance_sampling_ratio/max": 1.7540080547332764, + "sampling/importance_sampling_ratio/mean": 1.1875271797180176, + "sampling/importance_sampling_ratio/min": 0.387142539024353, + "sampling/sampling_logp_difference/max": 0.4821145534515381, + "sampling/sampling_logp_difference/mean": 0.02176579087972641, + "step": 405, + "step_time": 30.347279070992954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.342538058757782, + "epoch": 0.812, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2553136348724365, + "kl": 0.024068983271718025, + "learning_rate": 3.3719360358247054e-06, + "loss": 0.0313, + "num_tokens": 2269524.0, + "reward": 0.5887500047683716, + "reward_std": 0.2793101370334625, + "rewards/reward_func/mean": 0.5887500047683716, + "rewards/reward_func/std": 0.5110895037651062, + "sampling/importance_sampling_ratio/max": 1.5902272462844849, + "sampling/importance_sampling_ratio/mean": 0.9752755165100098, + "sampling/importance_sampling_ratio/min": 0.368798166513443, + "sampling/sampling_logp_difference/max": 0.5483064651489258, + "sampling/sampling_logp_difference/mean": 0.026680167764425278, + "step": 406, + "step_time": 32.252441548000206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.29614022374153137, + "epoch": 0.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7736188769340515, + "kl": 0.012905233539640903, + "learning_rate": 3.3643430180050573e-06, + "loss": 0.119, + "num_tokens": 2275015.0, + "reward": 0.22625000774860382, + "reward_std": 0.5097081661224365, + "rewards/reward_func/mean": 0.22625000774860382, + "rewards/reward_func/std": 0.4719545245170593, + "sampling/importance_sampling_ratio/max": 2.0418217182159424, + "sampling/importance_sampling_ratio/mean": 1.0212173461914062, + "sampling/importance_sampling_ratio/min": 0.137149840593338, + "sampling/sampling_logp_difference/max": 0.8900790214538574, + "sampling/sampling_logp_difference/mean": 0.023198578506708145, + "step": 407, + "step_time": 34.40931955000269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 54.125, + "completions/mean_terminated_length": 54.125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.31210464239120483, + "epoch": 0.816, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0732399225234985, + "kl": 0.04985180124640465, + "learning_rate": 3.3567409336361502e-06, + "loss": 0.4186, + "num_tokens": 2280251.0, + "reward": 0.36250001192092896, + "reward_std": 0.5389477610588074, + "rewards/reward_func/mean": 0.36250001192092896, + "rewards/reward_func/std": 0.5198007822036743, + "sampling/importance_sampling_ratio/max": 2.857747793197632, + "sampling/importance_sampling_ratio/mean": 1.1543896198272705, + "sampling/importance_sampling_ratio/min": 0.2027559131383896, + "sampling/sampling_logp_difference/max": 0.6502933502197266, + "sampling/sampling_logp_difference/mean": 0.025806117802858353, + "step": 408, + "step_time": 26.751435946003767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.36082959175109863, + "epoch": 0.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7728215456008911, + "kl": 0.017035100609064102, + "learning_rate": 3.3491298624602514e-06, + "loss": -0.0713, + "num_tokens": 2285456.0, + "reward": 0.33375000953674316, + "reward_std": 0.26373493671417236, + "rewards/reward_func/mean": 0.33375000953674316, + "rewards/reward_func/std": 0.5254912972450256, + "sampling/importance_sampling_ratio/max": 2.1468451023101807, + "sampling/importance_sampling_ratio/mean": 1.1487035751342773, + "sampling/importance_sampling_ratio/min": 0.7357592582702637, + "sampling/sampling_logp_difference/max": 0.2946791648864746, + "sampling/sampling_logp_difference/mean": 0.020242050290107727, + "step": 409, + "step_time": 42.09881355499965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 45.75, + "completions/mean_terminated_length": 45.75, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.3041061758995056, + "epoch": 0.82, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2047560214996338, + "kl": 0.03163205087184906, + "learning_rate": 3.3415098843138972e-06, + "loss": 0.0197, + "num_tokens": 2291363.0, + "reward": 0.2212499976158142, + "reward_std": 0.30923277139663696, + "rewards/reward_func/mean": 0.2212499976158142, + "rewards/reward_func/std": 0.46932896971702576, + "sampling/importance_sampling_ratio/max": 1.5919907093048096, + "sampling/importance_sampling_ratio/mean": 0.974951982498169, + "sampling/importance_sampling_ratio/min": 0.3237793445587158, + "sampling/sampling_logp_difference/max": 0.8042126893997192, + "sampling/sampling_logp_difference/mean": 0.026545334607362747, + "step": 410, + "step_time": 36.71569881300093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.33315184712409973, + "epoch": 0.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6845096349716187, + "kl": 0.013183169066905975, + "learning_rate": 3.333881079127052e-06, + "loss": 0.0578, + "num_tokens": 2297000.0, + "reward": 0.21375000476837158, + "reward_std": 0.5123411417007446, + "rewards/reward_func/mean": 0.21375000476837158, + "rewards/reward_func/std": 0.4748815596103668, + "sampling/importance_sampling_ratio/max": 1.133974552154541, + "sampling/importance_sampling_ratio/mean": 0.9385578632354736, + "sampling/importance_sampling_ratio/min": 0.5827104449272156, + "sampling/sampling_logp_difference/max": 0.30054330825805664, + "sampling/sampling_logp_difference/mean": 0.02072637900710106, + "step": 411, + "step_time": 42.78445947699947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.35779449343681335, + "epoch": 0.824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8247691988945007, + "kl": 0.012468487024307251, + "learning_rate": 3.326243526922272e-06, + "loss": 0.0478, + "num_tokens": 2302481.0, + "reward": 0.4762499928474426, + "reward_std": 0.021619636565446854, + "rewards/reward_func/mean": 0.4762499928474426, + "rewards/reward_func/std": 0.5523698925971985, + "sampling/importance_sampling_ratio/max": 1.451501488685608, + "sampling/importance_sampling_ratio/mean": 0.9668034911155701, + "sampling/importance_sampling_ratio/min": 0.5821980237960815, + "sampling/sampling_logp_difference/max": 0.33460497856140137, + "sampling/sampling_logp_difference/mean": 0.021230852231383324, + "step": 412, + "step_time": 41.77823870100838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 53.625, + "completions/mean_terminated_length": 53.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.31886088848114014, + "epoch": 0.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7706137895584106, + "kl": 0.01695754937827587, + "learning_rate": 3.3185973078138665e-06, + "loss": 0.2151, + "num_tokens": 2308188.0, + "reward": 0.19249999523162842, + "reward_std": 0.519284188747406, + "rewards/reward_func/mean": 0.19249999523162842, + "rewards/reward_func/std": 0.4808846116065979, + "sampling/importance_sampling_ratio/max": 1.734723687171936, + "sampling/importance_sampling_ratio/mean": 1.0181288719177246, + "sampling/importance_sampling_ratio/min": 0.5788209438323975, + "sampling/sampling_logp_difference/max": 0.39677077531814575, + "sampling/sampling_logp_difference/mean": 0.022095143795013428, + "step": 413, + "step_time": 39.981942358994274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 56.0, + "completions/mean_terminated_length": 56.0, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.38825303316116333, + "epoch": 0.828, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2646406888961792, + "kl": 0.011753249913454056, + "learning_rate": 3.3109425020070564e-06, + "loss": -0.0857, + "num_tokens": 2313426.0, + "reward": 0.3412500023841858, + "reward_std": 0.523719072341919, + "rewards/reward_func/mean": 0.3412500023841858, + "rewards/reward_func/std": 0.5026625990867615, + "sampling/importance_sampling_ratio/max": 1.6973400115966797, + "sampling/importance_sampling_ratio/mean": 1.1444008350372314, + "sampling/importance_sampling_ratio/min": 0.6853067874908447, + "sampling/sampling_logp_difference/max": 0.35372257232666016, + "sampling/sampling_logp_difference/mean": 0.02703225240111351, + "step": 414, + "step_time": 34.05183301899524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.31559109687805176, + "epoch": 0.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.001054048538208, + "kl": 0.025500796735286713, + "learning_rate": 3.3032791897971313e-06, + "loss": -0.0043, + "num_tokens": 2318855.0, + "reward": 0.4137499928474426, + "reward_std": 0.5379934906959534, + "rewards/reward_func/mean": 0.4137499928474426, + "rewards/reward_func/std": 0.5837303400039673, + "sampling/importance_sampling_ratio/max": 1.4571188688278198, + "sampling/importance_sampling_ratio/mean": 0.7654373645782471, + "sampling/importance_sampling_ratio/min": 0.521602988243103, + "sampling/sampling_logp_difference/max": 0.4150291681289673, + "sampling/sampling_logp_difference/mean": 0.024981288239359856, + "step": 415, + "step_time": 38.211065392009914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 57.625, + "completions/mean_terminated_length": 57.625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3610576391220093, + "epoch": 0.832, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0423651933670044, + "kl": 0.014878635294735432, + "learning_rate": 3.2956074515686105e-06, + "loss": 0.1218, + "num_tokens": 2324058.0, + "reward": 0.1875, + "reward_std": 0.5408138036727905, + "rewards/reward_func/mean": 0.1875, + "rewards/reward_func/std": 0.5013624429702759, + "sampling/importance_sampling_ratio/max": 1.7271286249160767, + "sampling/importance_sampling_ratio/mean": 0.9100079536437988, + "sampling/importance_sampling_ratio/min": 0.40812426805496216, + "sampling/sampling_logp_difference/max": 0.3937739133834839, + "sampling/sampling_logp_difference/mean": 0.02358619123697281, + "step": 416, + "step_time": 41.3646322049899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.3718615770339966, + "epoch": 0.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7263632416725159, + "kl": 0.02011047676205635, + "learning_rate": 3.2879273677943972e-06, + "loss": 0.0355, + "num_tokens": 2329647.0, + "reward": 0.17874999344348907, + "reward_std": 0.32287517189979553, + "rewards/reward_func/mean": 0.17874999344348907, + "rewards/reward_func/std": 0.49820929765701294, + "sampling/importance_sampling_ratio/max": 1.2329703569412231, + "sampling/importance_sampling_ratio/mean": 0.6824724674224854, + "sampling/importance_sampling_ratio/min": 0.3652595281600952, + "sampling/sampling_logp_difference/max": 0.40844106674194336, + "sampling/sampling_logp_difference/mean": 0.027271784842014313, + "step": 417, + "step_time": 43.37971075499081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.360298216342926, + "epoch": 0.836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1417040824890137, + "kl": 0.017974980175495148, + "learning_rate": 3.2802390190349364e-06, + "loss": 0.2216, + "num_tokens": 2336044.0, + "reward": 0.10249999910593033, + "reward_std": 0.26619309186935425, + "rewards/reward_func/mean": 0.10249999910593033, + "rewards/reward_func/std": 0.363426148891449, + "sampling/importance_sampling_ratio/max": 1.966254472732544, + "sampling/importance_sampling_ratio/mean": 0.8495919108390808, + "sampling/importance_sampling_ratio/min": 0.25239256024360657, + "sampling/sampling_logp_difference/max": 1.2355303764343262, + "sampling/sampling_logp_difference/mean": 0.029449839144945145, + "step": 418, + "step_time": 45.005784367007436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.35880693793296814, + "epoch": 0.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6445991396903992, + "kl": 0.02371111512184143, + "learning_rate": 3.272542485937369e-06, + "loss": -0.1228, + "num_tokens": 2341625.0, + "reward": -0.04999999701976776, + "reward_std": 0.04591917246580124, + "rewards/reward_func/mean": -0.04999999701976776, + "rewards/reward_func/std": 0.04309457913041115, + "sampling/importance_sampling_ratio/max": 1.6012232303619385, + "sampling/importance_sampling_ratio/mean": 0.7206702828407288, + "sampling/importance_sampling_ratio/min": 0.2793715298175812, + "sampling/sampling_logp_difference/max": 0.6976406574249268, + "sampling/sampling_logp_difference/mean": 0.02574693039059639, + "step": 419, + "step_time": 40.036054764001165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 50.0, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.38778284192085266, + "epoch": 0.84, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8026552796363831, + "kl": 0.021173562854528427, + "learning_rate": 3.264837849234685e-06, + "loss": -0.0381, + "num_tokens": 2348306.0, + "reward": -0.0625, + "reward_std": 0.05533730238676071, + "rewards/reward_func/mean": -0.0625, + "rewards/reward_func/std": 0.054967522621154785, + "sampling/importance_sampling_ratio/max": 2.244253396987915, + "sampling/importance_sampling_ratio/mean": 1.0164496898651123, + "sampling/importance_sampling_ratio/min": 0.2759284973144531, + "sampling/sampling_logp_difference/max": 0.572641134262085, + "sampling/sampling_logp_difference/mean": 0.02503090165555477, + "step": 420, + "step_time": 48.014933019003365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 59.75, + "completions/mean_terminated_length": 59.75, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.3811195492744446, + "epoch": 0.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8657627701759338, + "kl": 0.011216644197702408, + "learning_rate": 3.257125189744877e-06, + "loss": -0.1309, + "num_tokens": 2353901.0, + "reward": 0.3362500071525574, + "reward_std": 0.565845787525177, + "rewards/reward_func/mean": 0.3362500071525574, + "rewards/reward_func/std": 0.5429532527923584, + "sampling/importance_sampling_ratio/max": 1.4307345151901245, + "sampling/importance_sampling_ratio/mean": 0.824535608291626, + "sampling/importance_sampling_ratio/min": 0.5268330574035645, + "sampling/sampling_logp_difference/max": 0.5823192000389099, + "sampling/sampling_logp_difference/mean": 0.021394170820713043, + "step": 421, + "step_time": 35.67191677300434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.32794663310050964, + "epoch": 0.844, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6034460067749023, + "kl": 0.026636671274900436, + "learning_rate": 3.249404588370095e-06, + "loss": 0.2555, + "num_tokens": 2358707.0, + "reward": 0.4362500309944153, + "reward_std": 0.5625, + "rewards/reward_func/mean": 0.4362500309944153, + "rewards/reward_func/std": 0.6054971218109131, + "sampling/importance_sampling_ratio/max": 1.9666376113891602, + "sampling/importance_sampling_ratio/mean": 1.0515501499176025, + "sampling/importance_sampling_ratio/min": 0.25995469093322754, + "sampling/sampling_logp_difference/max": 0.5747603178024292, + "sampling/sampling_logp_difference/mean": 0.02525373175740242, + "step": 422, + "step_time": 33.49426256099832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 62.125, + "completions/mean_terminated_length": 62.125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.32681041955947876, + "epoch": 0.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7744244337081909, + "kl": 0.007102203089743853, + "learning_rate": 3.2416761260957925e-06, + "loss": 0.043, + "num_tokens": 2364549.0, + "reward": -0.06749999523162842, + "reward_std": 0.03864005580544472, + "rewards/reward_func/mean": -0.06749999523162842, + "rewards/reward_func/std": 0.04131758585572243, + "sampling/importance_sampling_ratio/max": 2.1614413261413574, + "sampling/importance_sampling_ratio/mean": 1.109354019165039, + "sampling/importance_sampling_ratio/min": 0.5632383823394775, + "sampling/sampling_logp_difference/max": 0.3571450710296631, + "sampling/sampling_logp_difference/mean": 0.020130092278122902, + "step": 423, + "step_time": 40.15369539499807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 57.375, + "completions/mean_terminated_length": 57.375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.31878265738487244, + "epoch": 0.848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3974499702453613, + "kl": 0.032708972692489624, + "learning_rate": 3.233939883989882e-06, + "loss": -0.0634, + "num_tokens": 2370417.0, + "reward": 0.20625001192092896, + "reward_std": 0.48608773946762085, + "rewards/reward_func/mean": 0.20625001192092896, + "rewards/reward_func/std": 0.45102858543395996, + "sampling/importance_sampling_ratio/max": 2.078763246536255, + "sampling/importance_sampling_ratio/mean": 1.177141547203064, + "sampling/importance_sampling_ratio/min": 0.7995861768722534, + "sampling/sampling_logp_difference/max": 0.32161664962768555, + "sampling/sampling_logp_difference/mean": 0.016767999157309532, + "step": 424, + "step_time": 39.82506796899543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 58.5, + "completions/mean_terminated_length": 58.5, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.38849684596061707, + "epoch": 0.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8255501389503479, + "kl": 0.013021498918533325, + "learning_rate": 3.2261959432018834e-06, + "loss": -0.1225, + "num_tokens": 2375900.0, + "reward": 0.4375, + "reward_std": 0.5317496061325073, + "rewards/reward_func/mean": 0.4375, + "rewards/reward_func/std": 0.5842394232749939, + "sampling/importance_sampling_ratio/max": 2.1501243114471436, + "sampling/importance_sampling_ratio/mean": 0.9571436643600464, + "sampling/importance_sampling_ratio/min": 0.4914774000644684, + "sampling/sampling_logp_difference/max": 0.5219483375549316, + "sampling/sampling_logp_difference/mean": 0.025387398898601532, + "step": 425, + "step_time": 39.655509564006934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 50.375, + "completions/mean_terminated_length": 50.375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.33532607555389404, + "epoch": 0.852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8230807781219482, + "kl": 0.01512499526143074, + "learning_rate": 3.218444384962071e-06, + "loss": -0.054, + "num_tokens": 2381685.0, + "reward": 0.3175000250339508, + "reward_std": 0.2747558355331421, + "rewards/reward_func/mean": 0.3175000250339508, + "rewards/reward_func/std": 0.5468807816505432, + "sampling/importance_sampling_ratio/max": 1.088066816329956, + "sampling/importance_sampling_ratio/mean": 0.7654009461402893, + "sampling/importance_sampling_ratio/min": 0.43912702798843384, + "sampling/sampling_logp_difference/max": 0.6466556787490845, + "sampling/sampling_logp_difference/mean": 0.025085650384426117, + "step": 426, + "step_time": 41.43392355799733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 53.875, + "completions/mean_terminated_length": 53.875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.33623552322387695, + "epoch": 0.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1959271430969238, + "kl": 0.015206445939838886, + "learning_rate": 3.210685290580622e-06, + "loss": 0.0497, + "num_tokens": 2386757.0, + "reward": 0.20374999940395355, + "reward_std": 0.5192359089851379, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.4818398654460907, + "sampling/importance_sampling_ratio/max": 1.3981304168701172, + "sampling/importance_sampling_ratio/mean": 0.8693457245826721, + "sampling/importance_sampling_ratio/min": 0.3326525390148163, + "sampling/sampling_logp_difference/max": 0.46013569831848145, + "sampling/sampling_logp_difference/mean": 0.02562933787703514, + "step": 427, + "step_time": 32.31069487700006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 53.0, + "completions/mean_terminated_length": 53.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.35203301906585693, + "epoch": 0.856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9189528226852417, + "kl": 0.015850629657506943, + "learning_rate": 3.2029187414467645e-06, + "loss": -0.017, + "num_tokens": 2392704.0, + "reward": 0.3450000286102295, + "reward_std": 0.5642583966255188, + "rewards/reward_func/mean": 0.3450000286102295, + "rewards/reward_func/std": 0.5430338382720947, + "sampling/importance_sampling_ratio/max": 1.4162225723266602, + "sampling/importance_sampling_ratio/mean": 0.7990955114364624, + "sampling/importance_sampling_ratio/min": 0.42359423637390137, + "sampling/sampling_logp_difference/max": 0.6078430414199829, + "sampling/sampling_logp_difference/mean": 0.02648048661649227, + "step": 428, + "step_time": 33.69458929898974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 52.875, + "completions/mean_terminated_length": 52.875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3973723351955414, + "epoch": 0.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6990527510643005, + "kl": 0.012423260137438774, + "learning_rate": 3.1951448190279256e-06, + "loss": 0.0692, + "num_tokens": 2398486.0, + "reward": 0.0949999988079071, + "reward_std": 0.26899394392967224, + "rewards/reward_func/mean": 0.0949999988079071, + "rewards/reward_func/std": 0.36621618270874023, + "sampling/importance_sampling_ratio/max": 1.2071622610092163, + "sampling/importance_sampling_ratio/mean": 0.8269263505935669, + "sampling/importance_sampling_ratio/min": 0.41573214530944824, + "sampling/sampling_logp_difference/max": 0.488450288772583, + "sampling/sampling_logp_difference/mean": 0.023476937785744667, + "step": 429, + "step_time": 35.47933077499329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 54.75, + "completions/mean_terminated_length": 54.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.36242973804473877, + "epoch": 0.86, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9864727258682251, + "kl": 0.010072307661175728, + "learning_rate": 3.1873636048688714e-06, + "loss": -0.1138, + "num_tokens": 2403892.0, + "reward": 0.33000001311302185, + "reward_std": 0.5705651044845581, + "rewards/reward_func/mean": 0.33000001311302185, + "rewards/reward_func/std": 0.5465214252471924, + "sampling/importance_sampling_ratio/max": 1.5361573696136475, + "sampling/importance_sampling_ratio/mean": 1.0155696868896484, + "sampling/importance_sampling_ratio/min": 0.7467938661575317, + "sampling/sampling_logp_difference/max": 0.23247402906417847, + "sampling/sampling_logp_difference/mean": 0.01987922564148903, + "step": 430, + "step_time": 29.146404054001323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 51.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.35852545499801636, + "epoch": 0.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.280659556388855, + "kl": 0.02400248870253563, + "learning_rate": 3.1795751805908578e-06, + "loss": -0.0427, + "num_tokens": 2409261.0, + "reward": 0.20374999940395355, + "reward_std": 0.5336059331893921, + "rewards/reward_func/mean": 0.20374999940395355, + "rewards/reward_func/std": 0.4941641688346863, + "sampling/importance_sampling_ratio/max": 1.468123197555542, + "sampling/importance_sampling_ratio/mean": 0.9840533137321472, + "sampling/importance_sampling_ratio/min": 0.4497135579586029, + "sampling/sampling_logp_difference/max": 0.39689433574676514, + "sampling/sampling_logp_difference/mean": 0.021220847964286804, + "step": 431, + "step_time": 33.816670406013145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3159053921699524, + "epoch": 0.864, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1149163246154785, + "kl": 0.01616514101624489, + "learning_rate": 3.171779627890769e-06, + "loss": -0.2228, + "num_tokens": 2413916.0, + "reward": 0.09000000357627869, + "reward_std": 0.25971826910972595, + "rewards/reward_func/mean": 0.09000000357627869, + "rewards/reward_func/std": 0.3446737825870514, + "sampling/importance_sampling_ratio/max": 1.8468085527420044, + "sampling/importance_sampling_ratio/mean": 1.1968073844909668, + "sampling/importance_sampling_ratio/min": 0.751221776008606, + "sampling/sampling_logp_difference/max": 0.3156614303588867, + "sampling/sampling_logp_difference/mean": 0.02191685512661934, + "step": 432, + "step_time": 33.72210700699361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.31895625591278076, + "epoch": 0.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546246767044067, + "kl": 0.020721694454550743, + "learning_rate": 3.1639770285402632e-06, + "loss": 0.0157, + "num_tokens": 2419091.0, + "reward": 0.1862500011920929, + "reward_std": 0.5399943590164185, + "rewards/reward_func/mean": 0.1862500011920929, + "rewards/reward_func/std": 0.5013107657432556, + "sampling/importance_sampling_ratio/max": 2.4649336338043213, + "sampling/importance_sampling_ratio/mean": 0.9663840532302856, + "sampling/importance_sampling_ratio/min": 0.36142030358314514, + "sampling/sampling_logp_difference/max": 0.4676198959350586, + "sampling/sampling_logp_difference/mean": 0.024064481258392334, + "step": 433, + "step_time": 30.73073644300166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 55.125, + "completions/mean_terminated_length": 55.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.39400357007980347, + "epoch": 0.868, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1748162508010864, + "kl": 0.023680763319134712, + "learning_rate": 3.1561674643849173e-06, + "loss": 0.2299, + "num_tokens": 2424823.0, + "reward": -0.04749999940395355, + "reward_std": 0.035702817142009735, + "rewards/reward_func/mean": -0.04749999940395355, + "rewards/reward_func/std": 0.03845219686627388, + "sampling/importance_sampling_ratio/max": 2.1316819190979004, + "sampling/importance_sampling_ratio/mean": 1.2642168998718262, + "sampling/importance_sampling_ratio/min": 0.5596238374710083, + "sampling/sampling_logp_difference/max": 0.49070703983306885, + "sampling/sampling_logp_difference/mean": 0.026236988604068756, + "step": 434, + "step_time": 46.65107938699657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.381875604391098, + "epoch": 0.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.038621187210083, + "kl": 0.016581958159804344, + "learning_rate": 3.148351017343363e-06, + "loss": 0.0706, + "num_tokens": 2431458.0, + "reward": 0.3075000047683716, + "reward_std": 0.28910407423973083, + "rewards/reward_func/mean": 0.3075000047683716, + "rewards/reward_func/std": 0.5689526796340942, + "sampling/importance_sampling_ratio/max": 1.7760720252990723, + "sampling/importance_sampling_ratio/mean": 1.0361217260360718, + "sampling/importance_sampling_ratio/min": 0.5515484809875488, + "sampling/sampling_logp_difference/max": 0.5586767196655273, + "sampling/sampling_logp_difference/mean": 0.025216208770871162, + "step": 435, + "step_time": 51.65877502699732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.33468616008758545, + "epoch": 0.872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9653282761573792, + "kl": 0.013657507486641407, + "learning_rate": 3.1405277694064306e-06, + "loss": 0.1378, + "num_tokens": 2437082.0, + "reward": 0.5924999713897705, + "reward_std": 0.5440975427627563, + "rewards/reward_func/mean": 0.5924999713897705, + "rewards/reward_func/std": 0.5312720537185669, + "sampling/importance_sampling_ratio/max": 1.5817586183547974, + "sampling/importance_sampling_ratio/mean": 0.9748501777648926, + "sampling/importance_sampling_ratio/min": 0.4865840971469879, + "sampling/sampling_logp_difference/max": 0.43617498874664307, + "sampling/sampling_logp_difference/mean": 0.022868365049362183, + "step": 436, + "step_time": 33.30673982600274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 51.375, + "completions/mean_terminated_length": 51.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3179436922073364, + "epoch": 0.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7460964918136597, + "kl": 0.009489338845014572, + "learning_rate": 3.1326978026362907e-06, + "loss": 0.2029, + "num_tokens": 2443073.0, + "reward": 0.22499999403953552, + "reward_std": 0.3186895549297333, + "rewards/reward_func/mean": 0.22499999403953552, + "rewards/reward_func/std": 0.4789273738861084, + "sampling/importance_sampling_ratio/max": 2.7308499813079834, + "sampling/importance_sampling_ratio/mean": 1.1698341369628906, + "sampling/importance_sampling_ratio/min": 0.4456416368484497, + "sampling/sampling_logp_difference/max": 0.751288652420044, + "sampling/sampling_logp_difference/mean": 0.02368471771478653, + "step": 437, + "step_time": 56.14041445599287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 49.375, + "completions/mean_terminated_length": 49.375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.33749181032180786, + "epoch": 0.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.662105679512024, + "kl": 0.013462947681546211, + "learning_rate": 3.1248611991655885e-06, + "loss": -0.0522, + "num_tokens": 2448957.0, + "reward": 0.32500001788139343, + "reward_std": 0.5697466135025024, + "rewards/reward_func/mean": 0.32500001788139343, + "rewards/reward_func/std": 0.554307758808136, + "sampling/importance_sampling_ratio/max": 2.088134288787842, + "sampling/importance_sampling_ratio/mean": 1.2728486061096191, + "sampling/importance_sampling_ratio/min": 0.7122271656990051, + "sampling/sampling_logp_difference/max": 1.059885025024414, + "sampling/sampling_logp_difference/mean": 0.023971613496541977, + "step": 438, + "step_time": 40.55881189700449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.39506107568740845, + "epoch": 0.878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9108272194862366, + "kl": 0.012316450476646423, + "learning_rate": 3.1170180411965854e-06, + "loss": -0.0778, + "num_tokens": 2455782.0, + "reward": 0.05125000327825546, + "reward_std": 0.2902706265449524, + "rewards/reward_func/mean": 0.05125000327825546, + "rewards/reward_func/std": 0.386502206325531, + "sampling/importance_sampling_ratio/max": 1.1981909275054932, + "sampling/importance_sampling_ratio/mean": 0.7162038087844849, + "sampling/importance_sampling_ratio/min": 0.44226300716400146, + "sampling/sampling_logp_difference/max": 0.5306471586227417, + "sampling/sampling_logp_difference/mean": 0.023498659953475, + "step": 439, + "step_time": 52.448768703994574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 58.5, + "completions/mean_terminated_length": 58.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.37565258145332336, + "epoch": 0.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8730466365814209, + "kl": 0.11407122761011124, + "learning_rate": 3.109168411000299e-06, + "loss": 0.0533, + "num_tokens": 2460686.0, + "reward": 0.20750001072883606, + "reward_std": 0.5263463258743286, + "rewards/reward_func/mean": 0.20750001072883606, + "rewards/reward_func/std": 0.4876108169555664, + "sampling/importance_sampling_ratio/max": 1.0784844160079956, + "sampling/importance_sampling_ratio/mean": 0.675607442855835, + "sampling/importance_sampling_ratio/min": 0.0301599632948637, + "sampling/sampling_logp_difference/max": 1.5674490928649902, + "sampling/sampling_logp_difference/mean": 0.030301496386528015, + "step": 440, + "step_time": 40.517616327008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 57.125, + "completions/mean_terminated_length": 57.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.34737882018089294, + "epoch": 0.882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7645708918571472, + "kl": 0.01040720660239458, + "learning_rate": 3.1013123909156347e-06, + "loss": 0.2671, + "num_tokens": 2465679.0, + "reward": 0.0637499988079071, + "reward_std": 0.2704658508300781, + "rewards/reward_func/mean": 0.0637499988079071, + "rewards/reward_func/std": 0.3515653908252716, + "sampling/importance_sampling_ratio/max": 1.5223848819732666, + "sampling/importance_sampling_ratio/mean": 1.0293443202972412, + "sampling/importance_sampling_ratio/min": 0.5342814326286316, + "sampling/sampling_logp_difference/max": 0.3400125503540039, + "sampling/sampling_logp_difference/mean": 0.019414888694882393, + "step": 441, + "step_time": 32.846883123987936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 52.75, + "completions/mean_terminated_length": 52.75, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.3078349232673645, + "epoch": 0.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4805446863174438, + "kl": 0.011887718923389912, + "learning_rate": 3.093450063348525e-06, + "loss": 0.193, + "num_tokens": 2471241.0, + "reward": 0.34375, + "reward_std": 0.28617510199546814, + "rewards/reward_func/mean": 0.34375, + "rewards/reward_func/std": 0.5389391183853149, + "sampling/importance_sampling_ratio/max": 2.141223192214966, + "sampling/importance_sampling_ratio/mean": 1.1272714138031006, + "sampling/importance_sampling_ratio/min": 0.6114374995231628, + "sampling/sampling_logp_difference/max": 0.6033744812011719, + "sampling/sampling_logp_difference/mean": 0.01826009526848793, + "step": 442, + "step_time": 36.22709542399389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3010959029197693, + "epoch": 0.886, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4849302768707275, + "kl": 0.009227529168128967, + "learning_rate": 3.085581510771067e-06, + "loss": -0.141, + "num_tokens": 2476093.0, + "reward": 0.3187499940395355, + "reward_std": 0.2881261706352234, + "rewards/reward_func/mean": 0.3187499940395355, + "rewards/reward_func/std": 0.5388993620872498, + "sampling/importance_sampling_ratio/max": 1.947962760925293, + "sampling/importance_sampling_ratio/mean": 1.3393418788909912, + "sampling/importance_sampling_ratio/min": 0.5013647675514221, + "sampling/sampling_logp_difference/max": 0.4786471128463745, + "sampling/sampling_logp_difference/mean": 0.01934540644288063, + "step": 443, + "step_time": 34.268270688000484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 51.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.33479422330856323, + "epoch": 0.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9843921065330505, + "kl": 0.008889885619282722, + "learning_rate": 3.0777068157206535e-06, + "loss": -0.1042, + "num_tokens": 2481969.0, + "reward": 0.3225000202655792, + "reward_std": 0.2926766574382782, + "rewards/reward_func/mean": 0.3225000202655792, + "rewards/reward_func/std": 0.5274940729141235, + "sampling/importance_sampling_ratio/max": 1.5019242763519287, + "sampling/importance_sampling_ratio/mean": 0.9874775409698486, + "sampling/importance_sampling_ratio/min": 0.6516150236129761, + "sampling/sampling_logp_difference/max": 0.47698545455932617, + "sampling/sampling_logp_difference/mean": 0.022217225283384323, + "step": 444, + "step_time": 37.88683218799997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.347456157207489, + "epoch": 0.89, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2607645988464355, + "kl": 0.011377193965017796, + "learning_rate": 3.0698260607991094e-06, + "loss": -0.1942, + "num_tokens": 2486972.0, + "reward": 0.061250001192092896, + "reward_std": 0.2965749502182007, + "rewards/reward_func/mean": 0.061250001192092896, + "rewards/reward_func/std": 0.3812175393104553, + "sampling/importance_sampling_ratio/max": 1.8228760957717896, + "sampling/importance_sampling_ratio/mean": 0.942634105682373, + "sampling/importance_sampling_ratio/min": 0.46362996101379395, + "sampling/sampling_logp_difference/max": 0.3296375274658203, + "sampling/sampling_logp_difference/mean": 0.01900862343609333, + "step": 445, + "step_time": 36.2604122460034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 46.625, + "completions/mean_terminated_length": 46.625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.37372976541519165, + "epoch": 0.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0763218402862549, + "kl": 0.01523488201200962, + "learning_rate": 3.061939328671824e-06, + "loss": 0.2346, + "num_tokens": 2492375.0, + "reward": 0.45375001430511475, + "reward_std": 0.6083322763442993, + "rewards/reward_func/mean": 0.45375001430511475, + "rewards/reward_func/std": 0.5633050203323364, + "sampling/importance_sampling_ratio/max": 1.4935482740402222, + "sampling/importance_sampling_ratio/mean": 0.9692554473876953, + "sampling/importance_sampling_ratio/min": 0.3600374758243561, + "sampling/sampling_logp_difference/max": 0.45327699184417725, + "sampling/sampling_logp_difference/mean": 0.023247534409165382, + "step": 446, + "step_time": 31.12013271600881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3365999460220337, + "epoch": 0.894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8528043031692505, + "kl": 0.014648355543613434, + "learning_rate": 3.054046702066886e-06, + "loss": -0.0107, + "num_tokens": 2498064.0, + "reward": 0.45374998450279236, + "reward_std": 0.5031688809394836, + "rewards/reward_func/mean": 0.45374998450279236, + "rewards/reward_func/std": 0.5408442616462708, + "sampling/importance_sampling_ratio/max": 1.301316499710083, + "sampling/importance_sampling_ratio/mean": 0.8368029594421387, + "sampling/importance_sampling_ratio/min": 0.5832191109657288, + "sampling/sampling_logp_difference/max": 0.5037274360656738, + "sampling/sampling_logp_difference/mean": 0.02292271889746189, + "step": 447, + "step_time": 38.94344817800447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 55.0, + "completions/mean_terminated_length": 55.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3609512448310852, + "epoch": 0.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.116279125213623, + "kl": 0.00902464147657156, + "learning_rate": 3.0461482637742133e-06, + "loss": 0.3113, + "num_tokens": 2504084.0, + "reward": 0.20375001430511475, + "reward_std": 0.32907286286354065, + "rewards/reward_func/mean": 0.20375001430511475, + "rewards/reward_func/std": 0.483822226524353, + "sampling/importance_sampling_ratio/max": 2.460287570953369, + "sampling/importance_sampling_ratio/mean": 1.2798258066177368, + "sampling/importance_sampling_ratio/min": 0.588743269443512, + "sampling/sampling_logp_difference/max": 0.512649416923523, + "sampling/sampling_logp_difference/mean": 0.021559733897447586, + "step": 448, + "step_time": 42.009523353000986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 53.75, + "completions/mean_terminated_length": 53.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.350589394569397, + "epoch": 0.898, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.955585241317749, + "kl": 0.0077981045469641685, + "learning_rate": 3.0382440966446876e-06, + "loss": -0.048, + "num_tokens": 2509876.0, + "reward": 0.4662500023841858, + "reward_std": 0.5135143995285034, + "rewards/reward_func/mean": 0.4662500023841858, + "rewards/reward_func/std": 0.5347346067428589, + "sampling/importance_sampling_ratio/max": 1.597688913345337, + "sampling/importance_sampling_ratio/mean": 1.0106717348098755, + "sampling/importance_sampling_ratio/min": 0.5116551518440247, + "sampling/sampling_logp_difference/max": 0.40577125549316406, + "sampling/sampling_logp_difference/mean": 0.026103615760803223, + "step": 449, + "step_time": 38.17177925199212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 57.25, + "completions/mean_terminated_length": 57.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.34553277492523193, + "epoch": 0.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7108585238456726, + "kl": 0.010503964498639107, + "learning_rate": 3.0303342835892804e-06, + "loss": -0.1033, + "num_tokens": 2516447.0, + "reward": 0.20999999344348907, + "reward_std": 0.32175832986831665, + "rewards/reward_func/mean": 0.20999999344348907, + "rewards/reward_func/std": 0.4881744682788849, + "sampling/importance_sampling_ratio/max": 1.5398190021514893, + "sampling/importance_sampling_ratio/mean": 0.9388370513916016, + "sampling/importance_sampling_ratio/min": 0.38391628861427307, + "sampling/sampling_logp_difference/max": 0.45900917053222656, + "sampling/sampling_logp_difference/mean": 0.020982615649700165, + "step": 450, + "step_time": 43.6907909319998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.32224124670028687, + "epoch": 0.902, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2028799057006836, + "kl": 0.016997255384922028, + "learning_rate": 3.0224189075781886e-06, + "loss": 0.0639, + "num_tokens": 2522872.0, + "reward": 0.10750000178813934, + "reward_std": 0.2686707377433777, + "rewards/reward_func/mean": 0.10750000178813934, + "rewards/reward_func/std": 0.3612181842327118, + "sampling/importance_sampling_ratio/max": 1.7533460855484009, + "sampling/importance_sampling_ratio/mean": 1.1469990015029907, + "sampling/importance_sampling_ratio/min": 0.3751453459262848, + "sampling/sampling_logp_difference/max": 0.5441827774047852, + "sampling/sampling_logp_difference/mean": 0.020926637575030327, + "step": 451, + "step_time": 44.405825520996586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.39713117480278015, + "epoch": 0.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0681439638137817, + "kl": 0.01726282574236393, + "learning_rate": 3.014498051639959e-06, + "loss": -0.1024, + "num_tokens": 2528540.0, + "reward": -0.027499999850988388, + "reward_std": 0.024149831384420395, + "rewards/reward_func/mean": -0.027499999850988388, + "rewards/reward_func/std": 0.022519832476973534, + "sampling/importance_sampling_ratio/max": 1.3811919689178467, + "sampling/importance_sampling_ratio/mean": 0.9916707277297974, + "sampling/importance_sampling_ratio/min": 0.5636754631996155, + "sampling/sampling_logp_difference/max": 0.34890270233154297, + "sampling/sampling_logp_difference/mean": 0.025962986052036285, + "step": 452, + "step_time": 39.69970705700689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 56.125, + "completions/mean_terminated_length": 56.125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.36409592628479004, + "epoch": 0.906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9367609024047852, + "kl": 0.008488480933010578, + "learning_rate": 3.006571798860626e-06, + "loss": 0.0109, + "num_tokens": 2533758.0, + "reward": 0.4700000286102295, + "reward_std": 0.4986118674278259, + "rewards/reward_func/mean": 0.4700000286102295, + "rewards/reward_func/std": 0.5269860625267029, + "sampling/importance_sampling_ratio/max": 1.6330208778381348, + "sampling/importance_sampling_ratio/mean": 1.038517951965332, + "sampling/importance_sampling_ratio/min": 0.5776368975639343, + "sampling/sampling_logp_difference/max": 0.4428684711456299, + "sampling/sampling_logp_difference/mean": 0.02270490676164627, + "step": 453, + "step_time": 31.74909894198936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 58.0, + "completions/mean_terminated_length": 58.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.3147682547569275, + "epoch": 0.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5675761103630066, + "kl": 0.007853170856833458, + "learning_rate": 2.9986402323828274e-06, + "loss": -0.1266, + "num_tokens": 2539130.0, + "reward": 0.18000000715255737, + "reward_std": 0.35241687297821045, + "rewards/reward_func/mean": 0.18000000715255737, + "rewards/reward_func/std": 0.5112450122833252, + "sampling/importance_sampling_ratio/max": 0.7927283644676208, + "sampling/importance_sampling_ratio/mean": 0.6180651187896729, + "sampling/importance_sampling_ratio/min": 0.16511119902133942, + "sampling/sampling_logp_difference/max": 0.6556259393692017, + "sampling/sampling_logp_difference/mean": 0.022515466436743736, + "step": 454, + "step_time": 38.15574240600108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3250408172607422, + "epoch": 0.91, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9744404554367065, + "kl": 0.008251778781414032, + "learning_rate": 2.9907034354049443e-06, + "loss": 0.1804, + "num_tokens": 2544571.0, + "reward": 0.44749999046325684, + "reward_std": 0.5988933444023132, + "rewards/reward_func/mean": 0.44749999046325684, + "rewards/reward_func/std": 0.5548165440559387, + "sampling/importance_sampling_ratio/max": 1.5385011434555054, + "sampling/importance_sampling_ratio/mean": 0.9114477634429932, + "sampling/importance_sampling_ratio/min": 0.29668596386909485, + "sampling/sampling_logp_difference/max": 0.35155487060546875, + "sampling/sampling_logp_difference/mean": 0.02304825559258461, + "step": 455, + "step_time": 33.35669154900825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 47.875, + "completions/mean_terminated_length": 47.875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3169974982738495, + "epoch": 0.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.806413471698761, + "kl": 0.013721331022679806, + "learning_rate": 2.9827614911802205e-06, + "loss": -0.0326, + "num_tokens": 2550599.0, + "reward": 0.5987499952316284, + "reward_std": 0.5605678558349609, + "rewards/reward_func/mean": 0.5987499952316284, + "rewards/reward_func/std": 0.543413519859314, + "sampling/importance_sampling_ratio/max": 1.1396749019622803, + "sampling/importance_sampling_ratio/mean": 0.8662426471710205, + "sampling/importance_sampling_ratio/min": 0.5802029371261597, + "sampling/sampling_logp_difference/max": 0.41710424423217773, + "sampling/sampling_logp_difference/mean": 0.018221460282802582, + "step": 456, + "step_time": 28.43808863000595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 51.25, + "completions/mean_terminated_length": 51.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3788241744041443, + "epoch": 0.914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3950449228286743, + "kl": 0.009939423762261868, + "learning_rate": 2.9748144830158925e-06, + "loss": -0.1656, + "num_tokens": 2555995.0, + "reward": 0.32499998807907104, + "reward_std": 0.5697804093360901, + "rewards/reward_func/mean": 0.32499998807907104, + "rewards/reward_func/std": 0.5482960343360901, + "sampling/importance_sampling_ratio/max": 1.5532302856445312, + "sampling/importance_sampling_ratio/mean": 1.2069408893585205, + "sampling/importance_sampling_ratio/min": 0.7212671041488647, + "sampling/sampling_logp_difference/max": 0.6211786270141602, + "sampling/sampling_logp_difference/mean": 0.020913410931825638, + "step": 457, + "step_time": 32.56511296798999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.3860653042793274, + "epoch": 0.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.918066680431366, + "kl": 0.0109860859811306, + "learning_rate": 2.966862494272316e-06, + "loss": 0.0279, + "num_tokens": 2561552.0, + "reward": 0.33000001311302185, + "reward_std": 0.5475718975067139, + "rewards/reward_func/mean": 0.33000001311302185, + "rewards/reward_func/std": 0.5318431854248047, + "sampling/importance_sampling_ratio/max": 1.018545389175415, + "sampling/importance_sampling_ratio/mean": 0.7079232931137085, + "sampling/importance_sampling_ratio/min": 0.3387696444988251, + "sampling/sampling_logp_difference/max": 0.3644402027130127, + "sampling/sampling_logp_difference/mean": 0.028172709047794342, + "step": 458, + "step_time": 43.0941359270073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.34271055459976196, + "epoch": 0.918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9301976561546326, + "kl": 0.03799736499786377, + "learning_rate": 2.9589056083620902e-06, + "loss": 0.1021, + "num_tokens": 2566731.0, + "reward": 0.1875, + "reward_std": 0.531709611415863, + "rewards/reward_func/mean": 0.1875, + "rewards/reward_func/std": 0.49430328607559204, + "sampling/importance_sampling_ratio/max": 1.698185920715332, + "sampling/importance_sampling_ratio/mean": 0.8187704682350159, + "sampling/importance_sampling_ratio/min": 0.2524738907814026, + "sampling/sampling_logp_difference/max": 0.7073307037353516, + "sampling/sampling_logp_difference/mean": 0.029650993645191193, + "step": 459, + "step_time": 36.267561529995874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 51.75, + "completions/mean_terminated_length": 51.75, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3156815767288208, + "epoch": 0.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8615498542785645, + "kl": 0.014610467478632927, + "learning_rate": 2.9509439087491837e-06, + "loss": 0.071, + "num_tokens": 2572071.0, + "reward": 0.33250001072883606, + "reward_std": 0.5657204389572144, + "rewards/reward_func/mean": 0.33250001072883606, + "rewards/reward_func/std": 0.546776294708252, + "sampling/importance_sampling_ratio/max": 1.6013303995132446, + "sampling/importance_sampling_ratio/mean": 0.9272360801696777, + "sampling/importance_sampling_ratio/min": 0.5939593315124512, + "sampling/sampling_logp_difference/max": 0.4442490339279175, + "sampling/sampling_logp_difference/mean": 0.0204640980809927, + "step": 460, + "step_time": 38.44636067999818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 47.375, + "completions/mean_terminated_length": 47.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.33302855491638184, + "epoch": 0.922, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6637190580368042, + "kl": 0.05872795730829239, + "learning_rate": 2.9429774789480576e-06, + "loss": 0.0747, + "num_tokens": 2577004.0, + "reward": 0.33500000834465027, + "reward_std": 0.5313875675201416, + "rewards/reward_func/mean": 0.33500000834465027, + "rewards/reward_func/std": 0.5113846659660339, + "sampling/importance_sampling_ratio/max": 1.9112398624420166, + "sampling/importance_sampling_ratio/mean": 0.9847674369812012, + "sampling/importance_sampling_ratio/min": 0.5912163853645325, + "sampling/sampling_logp_difference/max": 0.649017333984375, + "sampling/sampling_logp_difference/mean": 0.025793246924877167, + "step": 461, + "step_time": 27.65348793999874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3585735261440277, + "epoch": 0.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5941725969314575, + "kl": 0.00782017782330513, + "learning_rate": 2.93500640252279e-06, + "loss": -0.4099, + "num_tokens": 2582404.0, + "reward": 0.07625000178813934, + "reward_std": 0.2834315598011017, + "rewards/reward_func/mean": 0.07625000178813934, + "rewards/reward_func/std": 0.3762194514274597, + "sampling/importance_sampling_ratio/max": 2.3709073066711426, + "sampling/importance_sampling_ratio/mean": 1.1157333850860596, + "sampling/importance_sampling_ratio/min": 0.6296651363372803, + "sampling/sampling_logp_difference/max": 0.43739819526672363, + "sampling/sampling_logp_difference/mean": 0.02625291794538498, + "step": 462, + "step_time": 41.56208431599953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 56.625, + "completions/mean_terminated_length": 56.625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.3185637891292572, + "epoch": 0.926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8803660273551941, + "kl": 0.006397469900548458, + "learning_rate": 2.927030763086201e-06, + "loss": 0.0036, + "num_tokens": 2587423.0, + "reward": 0.4750000238418579, + "reward_std": 0.5177191495895386, + "rewards/reward_func/mean": 0.4750000238418579, + "rewards/reward_func/std": 0.5487908720970154, + "sampling/importance_sampling_ratio/max": 1.4098753929138184, + "sampling/importance_sampling_ratio/mean": 0.9890186190605164, + "sampling/importance_sampling_ratio/min": 0.6722173690795898, + "sampling/sampling_logp_difference/max": 0.31026315689086914, + "sampling/sampling_logp_difference/mean": 0.01704924926161766, + "step": 463, + "step_time": 27.707606516996748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 50.375, + "completions/mean_terminated_length": 50.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.341752827167511, + "epoch": 0.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.072123646736145, + "kl": 0.00813986174762249, + "learning_rate": 2.9190506442989753e-06, + "loss": 0.0383, + "num_tokens": 2592652.0, + "reward": -0.01874999888241291, + "reward_std": 0.01129152812063694, + "rewards/reward_func/mean": -0.01874999888241291, + "rewards/reward_func/std": 0.011259916238486767, + "sampling/importance_sampling_ratio/max": 1.9521905183792114, + "sampling/importance_sampling_ratio/mean": 1.0056908130645752, + "sampling/importance_sampling_ratio/min": 0.4280635416507721, + "sampling/sampling_logp_difference/max": 0.29516351222991943, + "sampling/sampling_logp_difference/mean": 0.020366854965686798, + "step": 464, + "step_time": 41.832906912997714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 56.875, + "completions/mean_terminated_length": 56.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.33210664987564087, + "epoch": 0.93, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4356821775436401, + "kl": 0.010456573218107224, + "learning_rate": 2.9110661298687824e-06, + "loss": 0.1115, + "num_tokens": 2597994.0, + "reward": 0.4599999785423279, + "reward_std": 0.6102153062820435, + "rewards/reward_func/mean": 0.4599999785423279, + "rewards/reward_func/std": 0.5649778842926025, + "sampling/importance_sampling_ratio/max": 1.6905672550201416, + "sampling/importance_sampling_ratio/mean": 0.9656480550765991, + "sampling/importance_sampling_ratio/min": 0.4363991916179657, + "sampling/sampling_logp_difference/max": 0.4984140396118164, + "sampling/sampling_logp_difference/mean": 0.01931898109614849, + "step": 465, + "step_time": 38.68506367498776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3974885940551758, + "epoch": 0.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.112714171409607, + "kl": 0.007472895085811615, + "learning_rate": 2.9030773035493997e-06, + "loss": 0.1579, + "num_tokens": 2603516.0, + "reward": 0.45875000953674316, + "reward_std": 0.6050867438316345, + "rewards/reward_func/mean": 0.45875000953674316, + "rewards/reward_func/std": 0.5605975985527039, + "sampling/importance_sampling_ratio/max": 1.5251615047454834, + "sampling/importance_sampling_ratio/mean": 1.0866496562957764, + "sampling/importance_sampling_ratio/min": 0.5772762894630432, + "sampling/sampling_logp_difference/max": 0.33701562881469727, + "sampling/sampling_logp_difference/mean": 0.02309374138712883, + "step": 466, + "step_time": 30.25664325800608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 54.75, + "completions/mean_terminated_length": 54.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3211430311203003, + "epoch": 0.934, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1987115144729614, + "kl": 0.006518337409943342, + "learning_rate": 2.8950842491398358e-06, + "loss": 0.2122, + "num_tokens": 2609600.0, + "reward": 0.3187499940395355, + "reward_std": 0.5568737983703613, + "rewards/reward_func/mean": 0.3187499940395355, + "rewards/reward_func/std": 0.5387137532234192, + "sampling/importance_sampling_ratio/max": 1.8600863218307495, + "sampling/importance_sampling_ratio/mean": 1.2007018327713013, + "sampling/importance_sampling_ratio/min": 0.5022804141044617, + "sampling/sampling_logp_difference/max": 0.36564385890960693, + "sampling/sampling_logp_difference/mean": 0.01762022264301777, + "step": 467, + "step_time": 37.397101591996034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 52.25, + "completions/mean_terminated_length": 52.25, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.34476569294929504, + "epoch": 0.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1792073249816895, + "kl": 0.008605368435382843, + "learning_rate": 2.8870870504834497e-06, + "loss": -0.2764, + "num_tokens": 2615093.0, + "reward": 0.3112500011920929, + "reward_std": 0.5650759339332581, + "rewards/reward_func/mean": 0.3112500011920929, + "rewards/reward_func/std": 0.5401173830032349, + "sampling/importance_sampling_ratio/max": 1.6590116024017334, + "sampling/importance_sampling_ratio/mean": 0.9717092514038086, + "sampling/importance_sampling_ratio/min": 0.4958648681640625, + "sampling/sampling_logp_difference/max": 0.3287513256072998, + "sampling/sampling_logp_difference/mean": 0.018109293654561043, + "step": 468, + "step_time": 38.83392319300037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 55.875, + "completions/mean_terminated_length": 55.875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.36447829008102417, + "epoch": 0.938, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4201726913452148, + "kl": 0.006331034004688263, + "learning_rate": 2.87908579146707e-06, + "loss": -0.1119, + "num_tokens": 2620537.0, + "reward": 0.08874999731779099, + "reward_std": 0.2768441140651703, + "rewards/reward_func/mean": 0.08874999731779099, + "rewards/reward_func/std": 0.37169256806373596, + "sampling/importance_sampling_ratio/max": 1.8191684484481812, + "sampling/importance_sampling_ratio/mean": 1.2801835536956787, + "sampling/importance_sampling_ratio/min": 0.562208890914917, + "sampling/sampling_logp_difference/max": 0.34848713874816895, + "sampling/sampling_logp_difference/mean": 0.021121980622410774, + "step": 469, + "step_time": 33.988550513007795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 48.125, + "completions/mean_terminated_length": 48.125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.34533941745758057, + "epoch": 0.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.524201512336731, + "kl": 0.01087350957095623, + "learning_rate": 2.8710805560201184e-06, + "loss": 0.1765, + "num_tokens": 2626187.0, + "reward": 0.47749999165534973, + "reward_std": 0.5234072804450989, + "rewards/reward_func/mean": 0.47749999165534973, + "rewards/reward_func/std": 0.5489405989646912, + "sampling/importance_sampling_ratio/max": 1.834068775177002, + "sampling/importance_sampling_ratio/mean": 0.9888299703598022, + "sampling/importance_sampling_ratio/min": 0.19429278373718262, + "sampling/sampling_logp_difference/max": 0.3557319641113281, + "sampling/sampling_logp_difference/mean": 0.02235410362482071, + "step": 470, + "step_time": 42.12490658200113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.3179677724838257, + "epoch": 0.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6474220752716064, + "kl": 0.008520292118191719, + "learning_rate": 2.8630714281137263e-06, + "loss": 0.4652, + "num_tokens": 2632352.0, + "reward": 0.3037499785423279, + "reward_std": 0.2923581600189209, + "rewards/reward_func/mean": 0.3037499785423279, + "rewards/reward_func/std": 0.5454732775688171, + "sampling/importance_sampling_ratio/max": 2.8816580772399902, + "sampling/importance_sampling_ratio/mean": 1.172086238861084, + "sampling/importance_sampling_ratio/min": 0.4828004837036133, + "sampling/sampling_logp_difference/max": 0.3818354606628418, + "sampling/sampling_logp_difference/mean": 0.026590559631586075, + "step": 471, + "step_time": 44.42633706900233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 53.125, + "completions/mean_terminated_length": 53.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.35466456413269043, + "epoch": 0.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0142226219177246, + "kl": 0.006118321791291237, + "learning_rate": 2.8550584917598558e-06, + "loss": -0.1626, + "num_tokens": 2638591.0, + "reward": 0.08874999731779099, + "reward_std": 0.28442591428756714, + "rewards/reward_func/mean": 0.08874999731779099, + "rewards/reward_func/std": 0.3698045015335083, + "sampling/importance_sampling_ratio/max": 1.8325932025909424, + "sampling/importance_sampling_ratio/mean": 1.0203707218170166, + "sampling/importance_sampling_ratio/min": 0.5196402668952942, + "sampling/sampling_logp_difference/max": 0.3019367456436157, + "sampling/sampling_logp_difference/mean": 0.020309556275606155, + "step": 472, + "step_time": 35.4328885779978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.0, + "completions/mean_terminated_length": 52.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.33365398645401, + "epoch": 0.946, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9207823276519775, + "kl": 0.007777152117341757, + "learning_rate": 2.8470418310104175e-06, + "loss": 0.0079, + "num_tokens": 2644340.0, + "reward": -0.0625, + "reward_std": 0.046318307518959045, + "rewards/reward_func/mean": -0.0625, + "rewards/reward_func/std": 0.04334248974919319, + "sampling/importance_sampling_ratio/max": 1.521376609802246, + "sampling/importance_sampling_ratio/mean": 0.9468077421188354, + "sampling/importance_sampling_ratio/min": 0.7389934659004211, + "sampling/sampling_logp_difference/max": 0.35080957412719727, + "sampling/sampling_logp_difference/mean": 0.024117249995470047, + "step": 473, + "step_time": 40.55488790899108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 55.25, + "completions/mean_terminated_length": 55.25, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.38789480924606323, + "epoch": 0.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0140682458877563, + "kl": 0.00960574857890606, + "learning_rate": 2.839021529956388e-06, + "loss": 0.2122, + "num_tokens": 2650117.0, + "reward": 0.20624999701976776, + "reward_std": 0.3101291358470917, + "rewards/reward_func/mean": 0.20624999701976776, + "rewards/reward_func/std": 0.47850772738456726, + "sampling/importance_sampling_ratio/max": 1.7706210613250732, + "sampling/importance_sampling_ratio/mean": 0.8846872448921204, + "sampling/importance_sampling_ratio/min": 0.47399798035621643, + "sampling/sampling_logp_difference/max": 0.4928736686706543, + "sampling/sampling_logp_difference/mean": 0.02432020753622055, + "step": 474, + "step_time": 47.36277217399038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 51.75, + "completions/mean_terminated_length": 51.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.3783317804336548, + "epoch": 0.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0589309930801392, + "kl": 0.00788644514977932, + "learning_rate": 2.8309976727269335e-06, + "loss": 0.2562, + "num_tokens": 2655768.0, + "reward": 0.057499997317790985, + "reward_std": 0.2759822607040405, + "rewards/reward_func/mean": 0.057499997317790985, + "rewards/reward_func/std": 0.3830609917640686, + "sampling/importance_sampling_ratio/max": 1.4918638467788696, + "sampling/importance_sampling_ratio/mean": 0.9599424600601196, + "sampling/importance_sampling_ratio/min": 0.4261733293533325, + "sampling/sampling_logp_difference/max": 0.47033143043518066, + "sampling/sampling_logp_difference/mean": 0.021787922829389572, + "step": 475, + "step_time": 40.27885495001101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.339898943901062, + "epoch": 0.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.971348226070404, + "kl": 0.00876643043011427, + "learning_rate": 2.8229703434885165e-06, + "loss": -0.0228, + "num_tokens": 2661491.0, + "reward": 0.44875001907348633, + "reward_std": 0.6022260189056396, + "rewards/reward_func/mean": 0.44875001907348633, + "rewards/reward_func/std": 0.5575568675994873, + "sampling/importance_sampling_ratio/max": 1.1516081094741821, + "sampling/importance_sampling_ratio/mean": 0.9316661357879639, + "sampling/importance_sampling_ratio/min": 0.5835716724395752, + "sampling/sampling_logp_difference/max": 0.4196091890335083, + "sampling/sampling_logp_difference/mean": 0.021482866257429123, + "step": 476, + "step_time": 32.179777643003035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.33622926473617554, + "epoch": 0.954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.885502815246582, + "kl": 0.006615090649574995, + "learning_rate": 2.814939626444023e-06, + "loss": 0.072, + "num_tokens": 2666869.0, + "reward": 0.07125000655651093, + "reward_std": 0.28238025307655334, + "rewards/reward_func/mean": 0.07125000655651093, + "rewards/reward_func/std": 0.36906978487968445, + "sampling/importance_sampling_ratio/max": 1.387723684310913, + "sampling/importance_sampling_ratio/mean": 0.8648278117179871, + "sampling/importance_sampling_ratio/min": 0.5371811985969543, + "sampling/sampling_logp_difference/max": 0.3530764579772949, + "sampling/sampling_logp_difference/mean": 0.025837857276201248, + "step": 477, + "step_time": 35.64491564700438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3508720397949219, + "epoch": 0.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0213587284088135, + "kl": 0.006008337251842022, + "learning_rate": 2.8069056058318754e-06, + "loss": -0.0749, + "num_tokens": 2672914.0, + "reward": 0.19750000536441803, + "reward_std": 0.33999553322792053, + "rewards/reward_func/mean": 0.19750000536441803, + "rewards/reward_func/std": 0.49813222885131836, + "sampling/importance_sampling_ratio/max": 1.4768438339233398, + "sampling/importance_sampling_ratio/mean": 0.9648414850234985, + "sampling/importance_sampling_ratio/min": 0.6780772805213928, + "sampling/sampling_logp_difference/max": 0.6728348731994629, + "sampling/sampling_logp_difference/mean": 0.021531637758016586, + "step": 478, + "step_time": 44.70471254599397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 45.625, + "completions/mean_terminated_length": 45.625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.332109659910202, + "epoch": 0.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0038514137268066, + "kl": 0.00846945308148861, + "learning_rate": 2.7988683659251475e-06, + "loss": 0.1922, + "num_tokens": 2678156.0, + "reward": -0.04124999791383743, + "reward_std": 0.022555213421583176, + "rewards/reward_func/mean": -0.04124999791383743, + "rewards/reward_func/std": 0.024164613336324692, + "sampling/importance_sampling_ratio/max": 1.750364065170288, + "sampling/importance_sampling_ratio/mean": 1.0368093252182007, + "sampling/importance_sampling_ratio/min": 0.6043331027030945, + "sampling/sampling_logp_difference/max": 0.3436398506164551, + "sampling/sampling_logp_difference/mean": 0.02278842404484749, + "step": 479, + "step_time": 35.97302589699393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 55.75, + "completions/mean_terminated_length": 55.75, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.3682764768600464, + "epoch": 0.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2075387239456177, + "kl": 0.005664853844791651, + "learning_rate": 2.7908279910306834e-06, + "loss": -0.0507, + "num_tokens": 2684319.0, + "reward": -0.03999999910593033, + "reward_std": 0.03759898990392685, + "rewards/reward_func/mean": -0.03999999910593033, + "rewards/reward_func/std": 0.03545621037483215, + "sampling/importance_sampling_ratio/max": 1.4793400764465332, + "sampling/importance_sampling_ratio/mean": 0.949470043182373, + "sampling/importance_sampling_ratio/min": 0.7095850110054016, + "sampling/sampling_logp_difference/max": 0.7661590576171875, + "sampling/sampling_logp_difference/mean": 0.022031132131814957, + "step": 480, + "step_time": 48.977692065993324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3368332087993622, + "epoch": 0.962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0318005084991455, + "kl": 0.011773078702390194, + "learning_rate": 2.7827845654882112e-06, + "loss": -0.211, + "num_tokens": 2689871.0, + "reward": 0.03875000774860382, + "reward_std": 0.3096115291118622, + "rewards/reward_func/mean": 0.03875000774860382, + "rewards/reward_func/std": 0.39116814732551575, + "sampling/importance_sampling_ratio/max": 1.4918369054794312, + "sampling/importance_sampling_ratio/mean": 0.9156639575958252, + "sampling/importance_sampling_ratio/min": 0.4481692910194397, + "sampling/sampling_logp_difference/max": 0.3614964485168457, + "sampling/sampling_logp_difference/mean": 0.020262327045202255, + "step": 481, + "step_time": 47.9836272290122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.3230361342430115, + "epoch": 0.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.032042384147644, + "kl": 0.0072667477652430534, + "learning_rate": 2.7747381736694573e-06, + "loss": 0.133, + "num_tokens": 2694882.0, + "reward": 0.45750004053115845, + "reward_std": 0.5040745139122009, + "rewards/reward_func/mean": 0.45750004053115845, + "rewards/reward_func/std": 0.5405222773551941, + "sampling/importance_sampling_ratio/max": 2.2135207653045654, + "sampling/importance_sampling_ratio/mean": 0.9904155135154724, + "sampling/importance_sampling_ratio/min": 0.5763868689537048, + "sampling/sampling_logp_difference/max": 0.2665048837661743, + "sampling/sampling_logp_difference/mean": 0.018102280795574188, + "step": 482, + "step_time": 34.09285955999803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.3808034658432007, + "epoch": 0.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3194098472595215, + "kl": 0.006681244820356369, + "learning_rate": 2.766688899977266e-06, + "loss": 0.0493, + "num_tokens": 2700228.0, + "reward": -0.06499999761581421, + "reward_std": 0.03923674300312996, + "rewards/reward_func/mean": -0.06499999761581421, + "rewards/reward_func/std": 0.03927921876311302, + "sampling/importance_sampling_ratio/max": 1.665113925933838, + "sampling/importance_sampling_ratio/mean": 1.090261459350586, + "sampling/importance_sampling_ratio/min": 0.6695654392242432, + "sampling/sampling_logp_difference/max": 0.3100537061691284, + "sampling/sampling_logp_difference/mean": 0.02159346640110016, + "step": 483, + "step_time": 43.91810172899568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 51.5, + "completions/mean_terminated_length": 51.5, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3179956078529358, + "epoch": 0.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6254982352256775, + "kl": 0.007635701447725296, + "learning_rate": 2.7586368288447094e-06, + "loss": -0.1305, + "num_tokens": 2705767.0, + "reward": 0.19750000536441803, + "reward_std": 0.350005179643631, + "rewards/reward_func/mean": 0.19750000536441803, + "rewards/reward_func/std": 0.4900656044483185, + "sampling/importance_sampling_ratio/max": 1.0408016443252563, + "sampling/importance_sampling_ratio/mean": 0.8733296990394592, + "sampling/importance_sampling_ratio/min": 0.4849831461906433, + "sampling/sampling_logp_difference/max": 0.32492589950561523, + "sampling/sampling_logp_difference/mean": 0.01731083169579506, + "step": 484, + "step_time": 43.16735061899817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.32685625553131104, + "epoch": 0.97, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9807037115097046, + "kl": 0.009544767439365387, + "learning_rate": 2.750582044734203e-06, + "loss": 0.1304, + "num_tokens": 2711361.0, + "reward": 0.05624999478459358, + "reward_std": 0.28626444935798645, + "rewards/reward_func/mean": 0.05624999478459358, + "rewards/reward_func/std": 0.38250818848609924, + "sampling/importance_sampling_ratio/max": 1.4526336193084717, + "sampling/importance_sampling_ratio/mean": 1.1034681797027588, + "sampling/importance_sampling_ratio/min": 0.822212815284729, + "sampling/sampling_logp_difference/max": 0.3479280471801758, + "sampling/sampling_logp_difference/mean": 0.019866865128278732, + "step": 485, + "step_time": 44.08477608699468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 48.625, + "completions/mean_terminated_length": 48.625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.34071382880210876, + "epoch": 0.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1414618492126465, + "kl": 0.008361655287444592, + "learning_rate": 2.7425246321366205e-06, + "loss": -0.0366, + "num_tokens": 2717584.0, + "reward": 0.1925000101327896, + "reward_std": 0.32843348383903503, + "rewards/reward_func/mean": 0.1925000101327896, + "rewards/reward_func/std": 0.49268218874931335, + "sampling/importance_sampling_ratio/max": 1.5116984844207764, + "sampling/importance_sampling_ratio/mean": 1.003057599067688, + "sampling/importance_sampling_ratio/min": 0.5172320604324341, + "sampling/sampling_logp_difference/max": 0.426800012588501, + "sampling/sampling_logp_difference/mean": 0.019641123712062836, + "step": 486, + "step_time": 47.363303733000066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 52.625, + "completions/mean_terminated_length": 52.625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.33557814359664917, + "epoch": 0.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0366374254226685, + "kl": 0.006426879204809666, + "learning_rate": 2.7344646755704078e-06, + "loss": 0.0267, + "num_tokens": 2723150.0, + "reward": 0.23000001907348633, + "reward_std": 0.5069743394851685, + "rewards/reward_func/mean": 0.23000001907348633, + "rewards/reward_func/std": 0.46940696239471436, + "sampling/importance_sampling_ratio/max": 1.188085675239563, + "sampling/importance_sampling_ratio/mean": 0.9919371604919434, + "sampling/importance_sampling_ratio/min": 0.6119289994239807, + "sampling/sampling_logp_difference/max": 0.3435969352722168, + "sampling/sampling_logp_difference/mean": 0.021727345883846283, + "step": 487, + "step_time": 35.52652668301016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.41280990839004517, + "epoch": 0.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0274547338485718, + "kl": 0.008126229047775269, + "learning_rate": 2.726402259580695e-06, + "loss": -0.1581, + "num_tokens": 2728807.0, + "reward": 0.1850000023841858, + "reward_std": 0.32565683126449585, + "rewards/reward_func/mean": 0.1850000023841858, + "rewards/reward_func/std": 0.47698459029197693, + "sampling/importance_sampling_ratio/max": 1.506007194519043, + "sampling/importance_sampling_ratio/mean": 1.029561996459961, + "sampling/importance_sampling_ratio/min": 0.42956024408340454, + "sampling/sampling_logp_difference/max": 1.2129210233688354, + "sampling/sampling_logp_difference/mean": 0.024751894176006317, + "step": 488, + "step_time": 47.60045914600778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 51.125, + "completions/mean_terminated_length": 51.125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.3394836187362671, + "epoch": 0.978, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.06610107421875, + "kl": 0.008799143135547638, + "learning_rate": 2.71833746873841e-06, + "loss": 0.077, + "num_tokens": 2733871.0, + "reward": 0.14749997854232788, + "reward_std": 0.35974743962287903, + "rewards/reward_func/mean": 0.14749997854232788, + "rewards/reward_func/std": 0.4975583255290985, + "sampling/importance_sampling_ratio/max": 1.507678747177124, + "sampling/importance_sampling_ratio/mean": 0.9904993772506714, + "sampling/importance_sampling_ratio/min": 0.5135411024093628, + "sampling/sampling_logp_difference/max": 0.5306490659713745, + "sampling/sampling_logp_difference/mean": 0.02448309026658535, + "step": 489, + "step_time": 48.45194135600468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 52.5, + "completions/mean_terminated_length": 52.5, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.33108896017074585, + "epoch": 0.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3293511867523193, + "kl": 0.009081996977329254, + "learning_rate": 2.7102703876393942e-06, + "loss": 0.2769, + "num_tokens": 2738881.0, + "reward": 0.33375000953674316, + "reward_std": 0.5720977783203125, + "rewards/reward_func/mean": 0.33375000953674316, + "rewards/reward_func/std": 0.5522664189338684, + "sampling/importance_sampling_ratio/max": 1.976300835609436, + "sampling/importance_sampling_ratio/mean": 1.1032793521881104, + "sampling/importance_sampling_ratio/min": 0.6139009594917297, + "sampling/sampling_logp_difference/max": 0.35714292526245117, + "sampling/sampling_logp_difference/mean": 0.02184763178229332, + "step": 490, + "step_time": 31.03724041300302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 54.125, + "completions/mean_terminated_length": 54.125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.3556636571884155, + "epoch": 0.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3523696660995483, + "kl": 0.006467908620834351, + "learning_rate": 2.702201100903511e-06, + "loss": -0.1537, + "num_tokens": 2745030.0, + "reward": 0.35624998807907104, + "reward_std": 0.5549872517585754, + "rewards/reward_func/mean": 0.35624998807907104, + "rewards/reward_func/std": 0.5310620069503784, + "sampling/importance_sampling_ratio/max": 2.1493470668792725, + "sampling/importance_sampling_ratio/mean": 1.0585157871246338, + "sampling/importance_sampling_ratio/min": 0.4366893470287323, + "sampling/sampling_logp_difference/max": 0.32509946823120117, + "sampling/sampling_logp_difference/mean": 0.020754611119627953, + "step": 491, + "step_time": 38.667722318990855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 41.875, + "completions/mean_terminated_length": 41.875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.32912951707839966, + "epoch": 0.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4372652769088745, + "kl": 0.010952703654766083, + "learning_rate": 2.694129693173759e-06, + "loss": 0.2391, + "num_tokens": 2750228.0, + "reward": 0.4750000238418579, + "reward_std": 0.5136302709579468, + "rewards/reward_func/mean": 0.4750000238418579, + "rewards/reward_func/std": 0.5536631345748901, + "sampling/importance_sampling_ratio/max": 2.09639310836792, + "sampling/importance_sampling_ratio/mean": 1.0164697170257568, + "sampling/importance_sampling_ratio/min": 0.6715074181556702, + "sampling/sampling_logp_difference/max": 0.34882307052612305, + "sampling/sampling_logp_difference/mean": 0.021729137748479843, + "step": 492, + "step_time": 27.27960569599236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3119977116584778, + "epoch": 0.986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0905507802963257, + "kl": 0.006891919299960136, + "learning_rate": 2.6860562491153854e-06, + "loss": 0.0112, + "num_tokens": 2755740.0, + "reward": 0.3087500035762787, + "reward_std": 0.25716888904571533, + "rewards/reward_func/mean": 0.3087500035762787, + "rewards/reward_func/std": 0.4853404462337494, + "sampling/importance_sampling_ratio/max": 1.7588069438934326, + "sampling/importance_sampling_ratio/mean": 1.2490370273590088, + "sampling/importance_sampling_ratio/min": 0.9198641180992126, + "sampling/sampling_logp_difference/max": 0.29522716999053955, + "sampling/sampling_logp_difference/mean": 0.01567711867392063, + "step": 493, + "step_time": 37.00472452600661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 47.625, + "completions/mean_terminated_length": 47.625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3635869324207306, + "epoch": 0.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.052552342414856, + "kl": 0.013993756845593452, + "learning_rate": 2.6779808534149986e-06, + "loss": -0.154, + "num_tokens": 2761592.0, + "reward": 0.32875001430511475, + "reward_std": 0.5612891316413879, + "rewards/reward_func/mean": 0.32875001430511475, + "rewards/reward_func/std": 0.5381035208702087, + "sampling/importance_sampling_ratio/max": 1.603551983833313, + "sampling/importance_sampling_ratio/mean": 0.9373511075973511, + "sampling/importance_sampling_ratio/min": 0.4633772671222687, + "sampling/sampling_logp_difference/max": 0.5257512331008911, + "sampling/sampling_logp_difference/mean": 0.02118011564016342, + "step": 494, + "step_time": 39.871601100996486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.36310869455337524, + "epoch": 0.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6600749492645264, + "kl": 0.015285097993910313, + "learning_rate": 2.6699035907796796e-06, + "loss": 0.1971, + "num_tokens": 2767247.0, + "reward": 0.08749999850988388, + "reward_std": 0.2818909287452698, + "rewards/reward_func/mean": 0.08749999850988388, + "rewards/reward_func/std": 0.3707424998283386, + "sampling/importance_sampling_ratio/max": 2.1048922538757324, + "sampling/importance_sampling_ratio/mean": 0.9266895055770874, + "sampling/importance_sampling_ratio/min": 0.27929919958114624, + "sampling/sampling_logp_difference/max": 1.3129551410675049, + "sampling/sampling_logp_difference/mean": 0.03234206885099411, + "step": 495, + "step_time": 41.51073724999151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 56.0, + "completions/mean_terminated_length": 56.0, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.3388064503669739, + "epoch": 0.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.291845679283142, + "kl": 0.00741872563958168, + "learning_rate": 2.6618245459360896e-06, + "loss": -0.0175, + "num_tokens": 2772579.0, + "reward": 0.46875, + "reward_std": 0.5083686113357544, + "rewards/reward_func/mean": 0.46875, + "rewards/reward_func/std": 0.5500503182411194, + "sampling/importance_sampling_ratio/max": 2.055424451828003, + "sampling/importance_sampling_ratio/mean": 1.1403716802597046, + "sampling/importance_sampling_ratio/min": 0.5961279273033142, + "sampling/sampling_logp_difference/max": 0.36696600914001465, + "sampling/sampling_logp_difference/mean": 0.02003820426762104, + "step": 496, + "step_time": 29.742875528987497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.3439745306968689, + "epoch": 0.994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8372857570648193, + "kl": 0.004724285565316677, + "learning_rate": 2.6537438036295876e-06, + "loss": -0.1035, + "num_tokens": 2778589.0, + "reward": 0.5837500095367432, + "reward_std": 0.5680229663848877, + "rewards/reward_func/mean": 0.5837500095367432, + "rewards/reward_func/std": 0.5513085126876831, + "sampling/importance_sampling_ratio/max": 1.4481302499771118, + "sampling/importance_sampling_ratio/mean": 0.8684442043304443, + "sampling/importance_sampling_ratio/min": 0.5821914672851562, + "sampling/sampling_logp_difference/max": 0.3571445345878601, + "sampling/sampling_logp_difference/mean": 0.019016366451978683, + "step": 497, + "step_time": 33.911930337999365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 49.625, + "completions/mean_terminated_length": 49.625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.3264671564102173, + "epoch": 0.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3532819747924805, + "kl": 0.009917155839502811, + "learning_rate": 2.6456614486233344e-06, + "loss": 0.0878, + "num_tokens": 2783355.0, + "reward": 0.3425000309944153, + "reward_std": 0.5395705103874207, + "rewards/reward_func/mean": 0.3425000309944153, + "rewards/reward_func/std": 0.5232794880867004, + "sampling/importance_sampling_ratio/max": 1.4326062202453613, + "sampling/importance_sampling_ratio/mean": 0.7578821182250977, + "sampling/importance_sampling_ratio/min": 0.3315609395503998, + "sampling/sampling_logp_difference/max": 0.3385963439941406, + "sampling/sampling_logp_difference/mean": 0.0261401254683733, + "step": 498, + "step_time": 29.913790080012404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 56.375, + "completions/mean_terminated_length": 56.375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.33865243196487427, + "epoch": 0.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7589961886405945, + "kl": 0.008822058327496052, + "learning_rate": 2.6375775656974124e-06, + "loss": 0.093, + "num_tokens": 2788575.0, + "reward": 0.027500003576278687, + "reward_std": 0.31530144810676575, + "rewards/reward_func/mean": 0.027500003576278687, + "rewards/reward_func/std": 0.3917998969554901, + "sampling/importance_sampling_ratio/max": 1.506130337715149, + "sampling/importance_sampling_ratio/mean": 0.8838839530944824, + "sampling/importance_sampling_ratio/min": 0.4121124744415283, + "sampling/sampling_logp_difference/max": 0.433666467666626, + "sampling/sampling_logp_difference/mean": 0.02140984870493412, + "step": 499, + "step_time": 41.98196549799468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.33830559253692627, + "epoch": 1.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.053360939025879, + "kl": 0.007612613961100578, + "learning_rate": 2.6294922396479263e-06, + "loss": -0.0695, + "num_tokens": 2794184.0, + "reward": 0.33249998092651367, + "reward_std": 0.5619164705276489, + "rewards/reward_func/mean": 0.33249998092651367, + "rewards/reward_func/std": 0.5373945832252502, + "sampling/importance_sampling_ratio/max": 1.1783527135849, + "sampling/importance_sampling_ratio/mean": 0.699951171875, + "sampling/importance_sampling_ratio/min": 0.35024145245552063, + "sampling/sampling_logp_difference/max": 0.5603160858154297, + "sampling/sampling_logp_difference/mean": 0.027539845556020737, + "step": 500, + "step_time": 38.46154696500162 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 2794184, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}